diff --git a/.github/actions/openvino_provider/action.yml b/.github/actions/openvino_provider/action.yml index dd1078bb0d4353..a17986f35d3723 100644 --- a/.github/actions/openvino_provider/action.yml +++ b/.github/actions/openvino_provider/action.yml @@ -177,7 +177,7 @@ runs: else ov_package_url=$(curl -s ${{ inputs.nightly_package_source }} | jq -r '.${{ inputs.platform }}_${{ inputs.arch }}') fi - cd ${{ inputs.install_dir || env.GITHUB_WORKSPACE }} + cd ${{ inputs.install_dir || github.workspace }} package_basename=$(basename $ov_package_url) wget $ov_package_url --progress=bar:force:noscroll -O $package_basename package_folder=${package_basename%.*} @@ -196,7 +196,7 @@ runs: uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 with: name: ${{ steps.openvino_s3_download.outputs.ov_artifact_name }} - path: ${{ steps.openvino_s3_download.outputs.ov_package_path }} + path: ${{ github.workspace }}/${{ steps.openvino_s3_download.outputs.ov_package_path }} if-no-files-found: 'error' - name: Get wheel diff --git a/.github/actions/openvino_provider/get_s3_package.py b/.github/actions/openvino_provider/get_s3_package.py index df253a422421ec..02ea99cb2f3403 100644 --- a/.github/actions/openvino_provider/get_s3_package.py +++ b/.github/actions/openvino_provider/get_s3_package.py @@ -54,6 +54,10 @@ def main(product, version_pattern, platform, arch, folder): matching_files = filter_files_by_criteria(all_files, product, version_pattern, platform, arch, folder) if matching_files: logger.info(f"Matching packages: {sorted(matching_files)}") + if len(matching_files) > 1: + custom_release_build_pattern = fr".*/{version_pattern}/(linux_|windows_|macos_).*/.*" + # Exclude custom release builds, if any, from matches + matching_files = [file for file in matching_files if not re.search(custom_release_build_pattern, file)] package_url = f"https://storage.openvinotoolkit.org{sorted(matching_files)[-1]}" logger.info(f"Returning package URL: {package_url}") 
action_utils.set_github_output("package_url", package_url) diff --git a/.github/components.yml b/.github/components.yml index 8de51a2ced3343..74247e1f051cd5 100644 --- a/.github/components.yml +++ b/.github/components.yml @@ -149,6 +149,7 @@ PyTorch_FE: build: - CPU - Python_API + - TOKENIZERS # PyTorch_FE tests depend on tokenizers build JAX_FE: revalidate: diff --git a/.github/dockerfiles/docker_tag b/.github/dockerfiles/docker_tag index 3783a7e8d5600a..bcfa07fb5c24b3 100644 --- a/.github/dockerfiles/docker_tag +++ b/.github/dockerfiles/docker_tag @@ -1 +1 @@ -pr-27430 +pr-27597 diff --git a/.github/dockerfiles/ov_test/debian_10_py310/Dockerfile b/.github/dockerfiles/ov_test/debian_10_py310/Dockerfile new file mode 100644 index 00000000000000..e7dbadf5a414ba --- /dev/null +++ b/.github/dockerfiles/ov_test/debian_10_py310/Dockerfile @@ -0,0 +1,76 @@ +ARG REGISTRY="docker.io" +FROM ${REGISTRY}/library/debian:10.13 + +USER root + +# APT configuration +RUN echo 'Acquire::Retries "10";' > /etc/apt/apt.conf && \ + echo 'APT::Get::Assume-Yes "true";' >> /etc/apt/apt.conf && \ + echo 'APT::Get::Fix-Broken "true";' >> /etc/apt/apt.conf && \ + echo 'APT::Get::no-install-recommends "true";' >> /etc/apt/apt.conf + +ENV DEBIAN_FRONTEND="noninteractive" \ + TZ="Europe/London" + +RUN apt-get update && \ + apt-get install \ + git \ + libc6-dev \ + # parallel gzip + pigz \ + # Python + python3 \ + python3-pip \ + python3-dev \ + python3-venv \ + python3-distutils \ + # To build Python 3.10 from source + build-essential \ + libffi-dev \ + libgdbm-dev \ + libc6-dev \ + libssl-dev \ + zlib1g-dev \ + libbz2-dev \ + libreadline-dev \ + libsqlite3-dev \ + libncurses5-dev \ + libncursesw5-dev \ + xz-utils \ + tk-dev \ + libxml2-dev \ + libxmlsec1-dev \ + liblzma-dev \ + wget \ + curl \ + && \ + rm -rf /var/lib/apt/lists/* + +# Install openvino dependencies +ADD scripts/install_dependencies/install_openvino_dependencies.sh /install_openvino_dependencies.sh +RUN chmod +x 
/install_openvino_dependencies.sh && \ + /install_openvino_dependencies.sh && \ + rm -rf /var/lib/apt/lists/* + +# Setup Python 3.10 +RUN wget https://www.python.org/ftp/python/3.10.9/Python-3.10.9.tar.xz + +RUN tar -xf Python-3.10.9.tar.xz && \ + cd Python-3.10.9 && \ + ./configure --enable-optimizations && \ + make -j 8 && \ + make altinstall + +# Setup pip +ENV PIP_VERSION="24.0" +RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \ + python3.10 get-pip.py --no-cache-dir pip==${PIP_VERSION} && \ + rm -f get-pip.py + +# Use Python 3.10 as default instead of Python 3.7 +# Using venv here 'cause other methods to switch the default Python on Ubuntu 20 break both system and wheels build +RUN python3.10 -m venv venv +ENV PATH="/venv/bin:$PATH" + +ENV PIP_CACHE_DIR=/mount/caches/pip/linux/${PIP_VERSION} +ENV PIP_INSTALL_PATH=/venv/lib/python3.10/site-packages diff --git a/.github/dockerfiles/ov_test/fedora_33/Dockerfile b/.github/dockerfiles/ov_test/fedora_33/Dockerfile index c059c82c7d3cf2..6e0fcc7d35156b 100644 --- a/.github/dockerfiles/ov_test/fedora_33/Dockerfile +++ b/.github/dockerfiles/ov_test/fedora_33/Dockerfile @@ -6,7 +6,13 @@ USER root RUN yum update -y && yum install -y \ git \ curl \ - python3 + python3 \ + findutils \ + ocl-icd \ + ocl-icd-devel \ + # parallel gzip + pigz \ + xz # Install Node ENV NODE_VERSION=21.7.3 diff --git a/.github/dockerfiles/ov_test/ubuntu_20_04_x64_py313/Dockerfile b/.github/dockerfiles/ov_test/ubuntu_20_04_x64_py313/Dockerfile new file mode 100644 index 00000000000000..b6b99f81305dee --- /dev/null +++ b/.github/dockerfiles/ov_test/ubuntu_20_04_x64_py313/Dockerfile @@ -0,0 +1,52 @@ +ARG REGISTRY="docker.io" +FROM ${REGISTRY}/library/ubuntu:20.04 + +USER root + +# APT configuration +RUN echo 'Acquire::Retries "10";' > /etc/apt/apt.conf && \ + echo 'APT::Get::Assume-Yes "true";' >> /etc/apt/apt.conf && \ + echo 'APT::Get::Fix-Broken "true";' >> /etc/apt/apt.conf && \ + echo 'APT::Get::no-install-recommends "true";' >> 
/etc/apt/apt.conf + +ENV DEBIAN_FRONTEND="noninteractive" \ + TZ="Europe/London" + +RUN apt-get update && \ + apt-get install software-properties-common && \ + add-apt-repository --yes --no-update ppa:git-core/ppa && \ + add-apt-repository --yes --no-update ppa:deadsnakes/ppa && \ + apt-get update && \ + apt-get install \ + curl \ + git \ + gpg-agent \ + tzdata \ + # parallel gzip + pigz \ + # Python + python3.13-dev \ + python3.13-venv \ + && \ + rm -rf /var/lib/apt/lists/* + +# Install openvino dependencies +ADD scripts/install_dependencies/install_openvino_dependencies.sh /install_openvino_dependencies.sh +RUN chmod +x /install_openvino_dependencies.sh && \ + /install_openvino_dependencies.sh && \ + rm -rf /var/lib/apt/lists/* + +# Setup pip +ENV PIP_VERSION="24.0" +RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \ + python3 get-pip.py --no-cache-dir pip==${PIP_VERSION} && \ + python3.13 get-pip.py --no-cache-dir pip==${PIP_VERSION} && \ + rm -f get-pip.py + +# Use Python 3.13 as default instead of Python 3.8 +# Using venv here 'cause other methods to switch the default Python on Ubuntu 20 break both system and wheels build +RUN python3.13 -m venv venv +ENV PATH="/venv/bin:$PATH" + +ENV PIP_CACHE_DIR=/mount/caches/pip/linux/${PIP_VERSION} +ENV PIP_INSTALL_PATH=/venv/lib/python3.13/site-packages diff --git a/.github/workflows/debian_10_arm.yml b/.github/workflows/debian_10_arm.yml index 73426222253adb..cf628d12c29b89 100644 --- a/.github/workflows/debian_10_arm.yml +++ b/.github/workflows/debian_10_arm.yml @@ -49,7 +49,7 @@ jobs: Docker: needs: Smart_CI if: "!needs.smart_ci.outputs.skip_workflow" - runs-on: aks-linux-16-cores-arm-docker-build + runs-on: aks-linux-4-cores-8gb-arm-docker-build container: image: openvinogithubactions.azurecr.io/docker_build:0.2 volumes: @@ -75,7 +75,7 @@ jobs: if: "!needs.smart_ci.outputs.skip_workflow" uses: ./.github/workflows/job_build_linux.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 
'aks-linux-16-cores-32gb-arm' container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_build.debian_10_arm }}", "volumes": ["/mount:/mount"], "options": "-e SCCACHE_AZURE_BLOB_CONTAINER -e SCCACHE_AZURE_CONNECTION_STRING"}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} event-name: ${{ github.event_name }} @@ -104,7 +104,7 @@ jobs: needs: [ Docker, Build, Smart_CI ] uses: ./.github/workflows/job_cxx_unit_tests.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-8-cores-16gb-arm' image: ${{ fromJSON(needs.docker.outputs.images).ov_test.debian_10_arm }} affected-components: ${{ needs.smart_ci.outputs.affected_components }} os: 'debian_10' @@ -116,6 +116,8 @@ jobs: needs: [ Docker, Build, Smart_CI ] uses: ./.github/workflows/job_cpu_functional_tests.yml with: + # Additional investigation needed why CPU functional tests are failing on v6 VM size's version, + # so leave it as it is for now runner: 'aks-linux-16-cores-arm' image: ${{ fromJSON(needs.docker.outputs.images).ov_test.debian_10_arm }} python-version: '3.7' diff --git a/.github/workflows/send_workflows_to_opentelemetry.yml b/.github/workflows/export_workflow_metrics.yml similarity index 56% rename from .github/workflows/send_workflows_to_opentelemetry.yml rename to .github/workflows/export_workflow_metrics.yml index ba38d6a9f90fed..084dfbdc34af7f 100644 --- a/.github/workflows/send_workflows_to_opentelemetry.yml +++ b/.github/workflows/export_workflow_metrics.yml @@ -1,40 +1,38 @@ -name: Export workflow metrics (BETA) +name: Export workflow metrics on: workflow_run: workflows: - - Android ARM64 with vcpkg - - Android x64 - - Documentation - - Cleanup PIP caches - - Code snippets - - Code Style - - Code coverage - - Coverity (Ubuntu 20.04, Python 3.11) - - Debian 10 ARM - - Fedora 29 (RHEL 8.4), Python 3.9 - - Linux (Ubuntu 20.04, Python 3.9) - - Linux (Ubuntu 22.04, Python 3.11) - - Linux (Ubuntu 24.04, Python 3.12) - - Linux ARM64 (Ubuntu 20.04, Python 3.11) - - 
Linux Static CC (Ubuntu 22.04, Python 3.11, Clang) - - Linux RISC-V with Conan (Ubuntu 22.04, Python 3.10) - - macOS (Python 3.11) - - macOS ARM64 (Python 3.11) - - MO - - Python API Checks - - Webassembly - - Windows (VS 2019, Python 3.11, Release) - - Windows (VS 2019, Python 3.11, Debug) - - Windows Conditional Compilation (VS 2022, Python 3.11) - - Rerun Workflow with Known Errors + - "Android ARM64 with vcpkg" + - "Android x64" + - "Cleanup caches" + - "Coverity (Ubuntu 20.04, Python 3.11)" + - "Debian 10 ARM" + - "Fedora 29 (RHEL 8.4), Python 3.9" + - "Linux (Ubuntu 20.04, Python 3.9)" + - "Linux (Ubuntu 22.04, Python 3.11)" + - "Linux (Ubuntu 24.04, Python 3.12)" + - "Linux ARM64 (Ubuntu 20.04, Python 3.11)" + - "Linux Static CC (Ubuntu 22.04, Python 3.11, Clang)" + - "Linux RISC-V with Conan (Ubuntu 22.04, Python 3.10)" + - "Linux (Ubuntu 22.04, Python 3.11, Intel DPC\\+\\+ Compiler)" + - "Linux CPU Plugin Snippets with LIBXSMM (Ubuntu 20.04)" + - "Linux Sanitizers (Ubuntu 20.04, Python 3.9)" + - "macOS (Python 3.11)" + - "macOS ARM64 (Python 3.11)" + - "Manylinux 2014" + - "Webassembly" + - "Windows (VS 2019, Python 3.11, Release)" + - "Windows (VS 2019, Python 3.11, Debug)" + - "Windows Conditional Compilation (VS 2022, Python 3.11)" + - "Rerun Workflow with Known Errors" types: - completed permissions: read-all jobs: - otel-export-trace: + export-workflow-metrics: name: Export finished workflow metrics runs-on: aks-linux-2-cores-8gb if: ${{ github.repository_owner == 'openvinotoolkit' }} diff --git a/.github/workflows/job_python_api_tests.yml b/.github/workflows/job_python_api_tests.yml new file mode 100644 index 00000000000000..541a14e2b1b6df --- /dev/null +++ b/.github/workflows/job_python_api_tests.yml @@ -0,0 +1,142 @@ +name: Python API tests + +on: + workflow_call: + inputs: + runner: + description: 'Machine on which the tests would run' + type: string + required: true + container: + description: 'JSON to be converted to the value of the "container" 
configuration for the job' + type: string + required: false + default: '{"image": null}' + python-version: + description: 'Python version to setup. E.g., "3.11"' + type: string + required: true + +permissions: read-all + +env: + PIP_CACHE_PATH: /mount/caches/pip/linux + +jobs: + Python_Unit_Tests: + name: Python API tests + timeout-minutes: 30 + runs-on: ${{ inputs.runner }} + container: ${{ fromJSON(inputs.container) }} + defaults: + run: + shell: bash + env: + DEBIAN_FRONTEND: noninteractive # to prevent apt-get from waiting user input + OPENVINO_REPO: ${{ github.workspace }}/openvino + INSTALL_DIR: ${{ github.workspace }}/install + INSTALL_TEST_DIR: ${{ github.workspace }}/install/openvino_tests + INSTALL_WHEELS_DIR: ${{ github.workspace }}/install/openvino_wheels + steps: + - name: Download OpenVINO artifacts (tarballs and wheels) + uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 + with: + pattern: openvino_@(wheels|tests) + path: ${{ env.INSTALL_DIR }} + + # Needed as ${{ github.workspace }} is not working correctly when using Docker + - name: Setup Variables + run: | + echo "OPENVINO_REPO=$GITHUB_WORKSPACE/openvino" >> "$GITHUB_ENV" + echo "INSTALL_DIR=$GITHUB_WORKSPACE/install" >> "$GITHUB_ENV" + echo "INSTALL_TEST_DIR=$GITHUB_WORKSPACE/install/openvino_tests" >> "$GITHUB_ENV" + echo "INSTALL_WHEELS_DIR=$GITHUB_WORKSPACE/install/openvino_wheels" >> "$GITHUB_ENV" + + - name: Install OpenVINO dependencies (mac) + if: runner.os == 'macOS' + run: brew install pigz + + - name: Extract OpenVINO packages + run: pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_TEST_DIR} + working-directory: ${{ env.INSTALL_TEST_DIR }} + + - name: Fetch setup_python and install wheels actions + uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + with: + sparse-checkout: | + .github/actions/setup_python/action.yml + .github/actions/install_ov_wheels/action.yml + sparse-checkout-cone-mode: false + path: 'action_root' + + - 
name: Setup Python ${{ inputs.python-version }} + uses: ./action_root/.github/actions/setup_python + with: + version: ${{ inputs.python-version }} + pip-cache-path: ${{ runner.os == 'Linux' && env.PIP_CACHE_PATH || '' }} + should-setup-pip-paths: ${{ runner.os == 'Linux' }} + self-hosted-runner: ${{ runner.os == 'Linux' }} + + # + # Tests + # + - name: Install OpenVINO Python wheels + uses: ./action_root/.github/actions/install_ov_wheels + with: + wheels-dir-path: ${{ env.INSTALL_WHEELS_DIR }} + wheels-to-install: 'openvino' + + - name: Install Python API tests dependencies + run: python3 -m pip install -r ${INSTALL_TEST_DIR}/tests/bindings/python/requirements_test.txt + + # + # Tests + # + + - name: Python API Tests + run: | + # for 'template' extension + export LD_LIBRARY_PATH=${INSTALL_TEST_DIR}/tests/:$LD_LIBRARY_PATH + python3 -m pytest -sv ${INSTALL_TEST_DIR}/tests/pyopenvino \ + --junitxml=${INSTALL_TEST_DIR}/TEST-Pyngraph.xml \ + --ignore=${INSTALL_TEST_DIR}/tests/pyopenvino/tests/test_utils/test_utils.py + + - name: Python API Tests -- numpy>=2.0.0 + run: | + python3 -m pip uninstall -y numpy + python3 -m pip install "numpy~=2.0.0" + python3 -m pip install -r ${INSTALL_TEST_DIR}/tests/bindings/python/requirements_test.txt + # for 'template' extension + export LD_LIBRARY_PATH=${INSTALL_TEST_DIR}/tests/:$LD_LIBRARY_PATH + python3 -m pytest -sv ${INSTALL_TEST_DIR}/tests/pyopenvino \ + --junitxml=${INSTALL_TEST_DIR}/TEST-Pyngraph_new_numpy.xml \ + --ignore=${INSTALL_TEST_DIR}/tests/pyopenvino/tests/test_utils/test_utils.py + + - name: Clone API snippets + if: runner.os != 'macOS' + uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 + with: + sparse-checkout: docs/articles_en/assets/snippets + path: ${{ env.OPENVINO_REPO }} + submodules: 'false' + + - name: Docs Python snippets + if: runner.os != 'macOS' + run: | + # torch, onnx + python3 -m pip install -r ${INSTALL_TEST_DIR}/tests/python/preprocess/torchvision/requirements.txt -r 
${INSTALL_TEST_DIR}/tests/requirements_onnx + # to find 'snippets' module in docs + export PYTHONPATH=${OPENVINO_REPO}/docs/articles_en/assets + # for 'template' extension + export LD_LIBRARY_PATH=${INSTALL_TEST_DIR}/tests/:$LD_LIBRARY_PATH + python3 ${OPENVINO_REPO}/docs/articles_en/assets/snippets/main.py + + - name: Upload Test Results + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + if: ${{ !cancelled() }} + with: + name: test-results-python-api-${{ inputs.python-version }} + path: | + ${{ env.INSTALL_TEST_DIR }}/TEST*.html + ${{ env.INSTALL_TEST_DIR }}/TEST*.xml + if-no-files-found: 'warn' diff --git a/.github/workflows/job_python_unit_tests.yml b/.github/workflows/job_python_unit_tests.yml index 8075f3299fe063..47506c83bf0945 100644 --- a/.github/workflows/job_python_unit_tests.yml +++ b/.github/workflows/job_python_unit_tests.yml @@ -65,21 +65,22 @@ jobs: echo "INSTALL_DIR=$GITHUB_WORKSPACE/install" >> "$GITHUB_ENV" echo "INSTALL_TEST_DIR=$GITHUB_WORKSPACE/install/tests" >> "$GITHUB_ENV" echo "LAYER_TESTS_INSTALL_DIR=$GITHUB_WORKSPACE/install/tests/layer_tests" >> "$GITHUB_ENV" + echo "INSTALL_WHEELS_DIR=$GITHUB_WORKSPACE/install/wheels" >> "$GITHUB_ENV" - name: Install OpenVINO dependencies (mac) if: runner.os == 'macOS' run: brew install pigz - name: Extract OpenVINO packages - run: | - pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} + run: pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} working-directory: ${{ env.INSTALL_DIR }} - - name: Fetch setup_python action + - name: Fetch setup_python and install wheels actions uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: | .github/actions/setup_python/action.yml + .github/actions/install_ov_wheels/action.yml sparse-checkout-cone-mode: false path: 'action_root' @@ -92,11 +93,10 @@ jobs: self-hosted-runner: ${{ runner.os == 'Linux' }} - name: Install OpenVINO Python wheels - run: | - # Install the core OV 
wheel - python3 -m pip install ./openvino-*.whl - - working-directory: ${{ env.INSTALL_WHEELS_DIR }} + uses: ./action_root/.github/actions/install_ov_wheels + with: + wheels-dir-path: ${{ env.INSTALL_WHEELS_DIR }} + wheels-to-install: 'openvino' - name: Install Python API tests dependencies run: | @@ -121,15 +121,6 @@ jobs: # Tests # - - name: Python API Tests - if: ${{ fromJSON(inputs.affected-components).Python_API.test }} - run: | - # for 'template' extension - export LD_LIBRARY_PATH=${INSTALL_TEST_DIR}:$LD_LIBRARY_PATH - python3 -m pytest -sv ${INSTALL_TEST_DIR}/pyopenvino \ - --junitxml=${INSTALL_TEST_DIR}/TEST-Pyngraph.xml \ - --ignore=${INSTALL_TEST_DIR}/pyopenvino/tests/test_utils/test_utils.py - - name: Python ONNX operators tests if: (fromJSON(inputs.affected-components).Python_API.test || fromJSON(inputs.affected-components).ONNX_FE.test) && @@ -185,35 +176,6 @@ jobs: TEST_DEVICE: CPU TEST_PRECISION: FP16 - - name: Clone API snippets - if: runner.os != 'macOS' - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - sparse-checkout: docs/articles_en/assets/snippets - path: ${{ env.OPENVINO_REPO }} - submodules: 'false' - - - name: Docs Python snippets - if: runner.os != 'macOS' - run: | - # to find 'snippets' module in docs - export PYTHONPATH=${OPENVINO_REPO}/docs/articles_en/assets - # for 'template' extension - export LD_LIBRARY_PATH=${INSTALL_TEST_DIR}:$LD_LIBRARY_PATH - python3 ${OPENVINO_REPO}/docs/articles_en/assets/snippets/main.py - - - name: Python API Tests -- numpy>=2.0.0 - if: ${{ fromJSON(inputs.affected-components).Python_API.test }} - run: | - python3 -m pip uninstall -y numpy - python3 -m pip install "numpy>=2.0.0,<2.2.0" - python3 -m pip install -r ${INSTALL_TEST_DIR}/bindings/python/requirements_test.txt - # for 'template' extension - export LD_LIBRARY_PATH=${INSTALL_TEST_DIR}:$LD_LIBRARY_PATH - python3 -m pytest -sv ${INSTALL_TEST_DIR}/pyopenvino \ - --junitxml=${INSTALL_TEST_DIR}/TEST-Pyngraph.xml \ - 
--ignore=${INSTALL_TEST_DIR}/pyopenvino/tests/test_utils/test_utils.py - - name: Upload Test Results uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 if: ${{ !cancelled() }} diff --git a/.github/workflows/job_samples_tests.yml b/.github/workflows/job_samples_tests.yml index e144aa0cfb95aa..6f95d316abfc3f 100644 --- a/.github/workflows/job_samples_tests.yml +++ b/.github/workflows/job_samples_tests.yml @@ -54,6 +54,7 @@ jobs: echo "INSTALL_DIR=$GITHUB_WORKSPACE/install" >> "$GITHUB_ENV" echo "INSTALL_TEST_DIR=$GITHUB_WORKSPACE/install/tests" >> "$GITHUB_ENV" echo "BUILD_DIR=$GITHUB_WORKSPACE/build" >> "$GITHUB_ENV" + echo "INSTALL_WHEELS_DIR=$GITHUB_WORKSPACE/install/wheels" >> "$GITHUB_ENV" - name: Install OpenVINO dependencies (mac) if: runner.os == 'macOS' @@ -65,13 +66,12 @@ jobs: pigz -dc openvino_tests.tar.gz | tar -xf - -C ${INSTALL_DIR} working-directory: ${{ env.INSTALL_DIR }} - - name: Fetch setup_python action - # Python is already installed on Ubuntu within Dockerfile - if: runner.os != 'Linux' + - name: Fetch setup_python and install wheels actions uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: sparse-checkout: | .github/actions/setup_python/action.yml + .github/actions/install_ov_wheels/action.yml sparse-checkout-cone-mode: false path: 'openvino' @@ -113,6 +113,12 @@ jobs: # Tests # + - name: Install OpenVINO Python wheels + uses: ./openvino/.github/actions/install_ov_wheels + with: + wheels-dir-path: ${{ env.INSTALL_WHEELS_DIR }} + wheels-to-install: 'openvino' + - name: Samples tests if: fromJSON(inputs.affected-components).samples.test run: | @@ -122,7 +128,7 @@ jobs: export SHARE=$INSTALL_TEST_DIR/smoke_tests/samples_smoke_tests_data # Install Python benchmark_app by installing openvino-*.whl - python3 -m pip install --ignore-installed PyYAML -r $INSTALL_TEST_DIR/smoke_tests/requirements.txt $INSTALL_WHEELS_DIR/openvino-*.whl + python3 -m pip install --ignore-installed PyYAML -r 
$INSTALL_TEST_DIR/smoke_tests/requirements.txt export LD_LIBRARY_PATH=${IE_APP_PATH}:$LD_LIBRARY_PATH source ${INSTALL_DIR}/setupvars.sh diff --git a/.github/workflows/linux_arm64.yml b/.github/workflows/linux_arm64.yml index 10de6867c7d0e2..e1aaa886d631c7 100644 --- a/.github/workflows/linux_arm64.yml +++ b/.github/workflows/linux_arm64.yml @@ -53,7 +53,7 @@ jobs: Docker: needs: Smart_CI if: "!needs.smart_ci.outputs.skip_workflow" - runs-on: aks-linux-16-cores-arm-docker-build + runs-on: aks-linux-4-cores-8gb-arm-docker-build container: image: openvinogithubactions.azurecr.io/docker_build:0.2 volumes: @@ -78,7 +78,7 @@ jobs: needs: [ Docker, Smart_CI ] uses: ./.github/workflows/job_build_linux.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-16-cores-32gb-arm' container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_build.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"], "options": "-e SCCACHE_AZURE_BLOB_CONTAINER -e SCCACHE_AZURE_CONNECTION_STRING"}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} event-name: ${{ github.event_name }} @@ -105,7 +105,7 @@ jobs: if: ${{ 'false' }} uses: ./.github/workflows/job_debian_packages.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-4-cores-8gb-arm' image: 'openvinogithubactions.azurecr.io/dockerhub/ubuntu:20.04' Samples: @@ -113,7 +113,7 @@ jobs: if: fromJSON(needs.smart_ci.outputs.affected_components).samples uses: ./.github/workflows/job_samples_tests.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-8-cores-16gb-arm' container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"]}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} @@ -123,7 +123,7 @@ jobs: if: fromJSON(needs.smart_ci.outputs.affected_components).JS_API uses: ./.github/workflows/job_openvino_js.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-4-cores-8gb-arm' container: 
'{"image": "${{ fromJSON(needs.docker.outputs.images).ov_build.ubuntu_20_04_arm64 }}"}' ONNX_Runtime: @@ -133,7 +133,7 @@ jobs: needs: [ Build, Smart_CI, Docker ] uses: ./.github/workflows/job_onnx_runtime.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-16-cores-32gb-arm' container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_build.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"], "options": "-e SCCACHE_AZURE_BLOB_CONTAINER -e SCCACHE_AZURE_CONNECTION_STRING"}' sccache-azure-key-prefix: 'ubuntu20_aarch64_onnxruntime' @@ -142,7 +142,7 @@ jobs: needs: [ Build, Docker, Smart_CI ] uses: ./.github/workflows/job_tokenizers.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-8-cores-16gb-arm' shell: bash container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_build.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"]}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} @@ -154,7 +154,7 @@ jobs: needs: [ Build, Docker, Smart_CI ] uses: ./.github/workflows/job_cxx_unit_tests.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-8-cores-16gb-arm' image: ${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }} affected-components: ${{ needs.smart_ci.outputs.affected_components }} os: 'ubuntu_20_04' @@ -164,17 +164,27 @@ jobs: needs: [ Build, Docker, Smart_CI ] uses: ./.github/workflows/job_python_unit_tests.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-8-cores-16gb-arm' container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"]}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.11' + Python_API_Tests: + name: Python API tests + needs: [ Docker, Build, Smart_CI ] + uses: ./.github/workflows/job_python_api_tests.yml + with: + runner: 'aks-linux-16-cores-arm' + container: '{"image": "${{ 
fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"]}' + python-version: '3.11' + if: fromJSON(needs.smart_ci.outputs.affected_components).Python_API.test + TensorFlow_Layer_Tests: name: TensorFlow Layer Tests needs: [ Build, Docker, Smart_CI, Openvino_tokenizers ] uses: ./.github/workflows/job_tensorflow_layer_tests.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-16-cores-32gb-arm' container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"]}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.11' @@ -184,7 +194,7 @@ jobs: needs: [ Build, Docker, Smart_CI ] uses: ./.github/workflows/job_pytorch_layer_tests.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-16-cores-32gb-arm' container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"]}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.11' @@ -195,6 +205,8 @@ jobs: needs: [ Build, Docker, Smart_CI ] uses: ./.github/workflows/job_cpu_functional_tests.yml with: + # Additional investigation needed why CPU functional tests are failing on v6 VM size's version, + # so leave it as it is for now runner: 'aks-linux-16-cores-arm' image: ${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }} python-version: '3.11' @@ -207,7 +219,7 @@ jobs: needs: [ Build, Docker, Smart_CI, Openvino_tokenizers] uses: ./.github/workflows/job_tensorflow_models_tests.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-16-cores-32gb-arm' container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }}"}' model_scope: 'precommit' @@ -218,7 +230,7 @@ jobs: needs: [ Build, Docker, Smart_CI ] uses: ./.github/workflows/job_pytorch_models_tests.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 
'aks-linux-16-cores-32gb-arm' container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }}"}' model_scope: 'precommit' diff --git a/.github/workflows/linux_conditional_compilation.yml b/.github/workflows/linux_conditional_compilation.yml index 27f54da6ecdc60..ce78a9f3ae63b7 100644 --- a/.github/workflows/linux_conditional_compilation.yml +++ b/.github/workflows/linux_conditional_compilation.yml @@ -212,7 +212,6 @@ jobs: tar -cvf - \ tests/ov_cpu_func_tests \ tests/libopenvino_template_extension.so \ - tests/libze_loader.so* \ tests/libhwloc* \ tests/libtbb* \ tests/functional_test_utils/layer_tests_summary/* \ diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index c587c5ad7323b3..26289e969c4e00 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -151,6 +151,7 @@ jobs: -DENABLE_CPPLINT=OFF \ -DENABLE_NCC_STYLE=OFF \ -DENABLE_TESTS=ON \ + -DENABLE_WHEEL=OFF \ -DCMAKE_COMPILE_WARNING_AS_ERROR=OFF \ -DENABLE_STRICT_DEPENDENCIES=OFF \ -DCMAKE_CXX_COMPILER_LAUNCHER=${{ env.CMAKE_CXX_COMPILER_LAUNCHER }} \ @@ -168,7 +169,6 @@ jobs: run: | cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_DIR }} -P ${{ env.BUILD_DIR }}/cmake_install.cmake cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_TEST_DIR }} -DCOMPONENT=tests -P ${{ env.BUILD_DIR }}/cmake_install.cmake - cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_WHEELS_DIR }} -DCOMPONENT=python_wheels -P ${{ env.BUILD_DIR }}/cmake_install.cmake - name: Pack Artifacts run: | @@ -179,6 +179,48 @@ jobs: tar -cvf - * | pigz > ${{ env.BUILD_DIR }}/openvino_tests.tar.gz popd + # Setup additional Python versions for wheels building + - name: Setup Python 3.9 + uses: ./openvino/.github/actions/setup_python + with: + version: "3.9" + should-setup-pip-paths: 'false' + self-hosted-runner: 'false' + + - name: Setup Python 3.10 + uses: ./openvino/.github/actions/setup_python + with: + version: "3.10" + should-setup-pip-paths: 'false' + self-hosted-runner: 'false' + + - name: 
Setup Python 3.12 + uses: ./openvino/.github/actions/setup_python + with: + version: "3.12" + should-setup-pip-paths: 'false' + self-hosted-runner: 'false' + + - name: Build additional Python wheels + run: | + for py_version in "3.9" "3.10" "3.11" "3.12" + do + python_exec_path=$(python$py_version -c "import sys; print(sys.executable)") + $python_exec_path -m pip install -r ${{ env.OPENVINO_REPO }}/src/bindings/python/wheel/requirements-dev.txt + + cmake -DPython3_EXECUTABLE=$python_exec_path -DENABLE_WHEEL=ON -DOpenVINODeveloperPackage_DIR=${{ env.BUILD_DIR }} -S ${{ env.OPENVINO_REPO }}/src/bindings/python -B ${{ github.workspace }}/py$py_version + cmake --build ${{ github.workspace }}/py$py_version --parallel + cmake --install ${{ github.workspace }}/py$py_version --config ${{ env.CMAKE_BUILD_TYPE }} --prefix ${{ env.INSTALL_WHEELS_DIR }} --component python_wheels + done + + # Setup Python 3.11 as the default one + - name: Setup Python ${{ env.PYTHON_VERSION }} + uses: ./openvino/.github/actions/setup_python + with: + version: ${{ env.PYTHON_VERSION }} + should-setup-pip-paths: 'false' + self-hosted-runner: 'false' + - name: Cmake & Build - OpenVINO Contrib run: | cmake \ @@ -199,6 +241,7 @@ jobs: cmake --build ${{ env.BUILD_DIR }} --parallel $(nproc) cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_DIR_JS }} -P ${{ env.BUILD_DIR }}/cmake_install.cmake + # # Upload build artifacts # @@ -210,7 +253,7 @@ jobs: name: openvino_package path: ${{ env.BUILD_DIR }}/openvino_package.tar.gz if-no-files-found: 'error' - + - name: Upload openvino wheels uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: @@ -270,6 +313,19 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} os: 'mac_13' + Python_API_Tests: + name: Python API tests + needs: [ Build, Smart_CI ] + uses: ./.github/workflows/job_python_api_tests.yml + strategy: + fail-fast: false + matrix: + python-version: [ '3.9', '3.10', '3.11', '3.12' ] + with: + 
runner: 'macos-13' + python-version: ${{ matrix.python-version }} + if: fromJSON(needs.smart_ci.outputs.affected_components).Python_API.test + Python_Unit_Tests: name: Python unit tests needs: [ Build, Smart_CI ] diff --git a/.github/workflows/mac_arm64.yml b/.github/workflows/mac_arm64.yml index 0708a844fe6b8b..d3fb10082adfd4 100644 --- a/.github/workflows/mac_arm64.yml +++ b/.github/workflows/mac_arm64.yml @@ -151,6 +151,7 @@ jobs: -DENABLE_CPPLINT=OFF \ -DENABLE_NCC_STYLE=OFF \ -DENABLE_TESTS=ON \ + -DENABLE_WHEEL=OFF \ -DCMAKE_COMPILE_WARNING_AS_ERROR=OFF \ -DENABLE_STRICT_DEPENDENCIES=OFF \ -DCMAKE_CXX_COMPILER_LAUNCHER=${{ env.CMAKE_CXX_COMPILER_LAUNCHER }} \ @@ -168,7 +169,6 @@ jobs: run: | cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_DIR }} -P ${{ env.BUILD_DIR }}/cmake_install.cmake cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_TEST_DIR }} -DCOMPONENT=tests -P ${{ env.BUILD_DIR }}/cmake_install.cmake - cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_WHEELS_DIR }} -DCOMPONENT=python_wheels -P ${{ env.BUILD_DIR }}/cmake_install.cmake - name: Pack Artifacts run: | @@ -180,6 +180,48 @@ jobs: tar -cvf - * | pigz > ${{ env.BUILD_DIR }}/openvino_tests.tar.gz popd + # Setup additional Python versions for wheels building + - name: Setup Python 3.9 + uses: ./openvino/.github/actions/setup_python + with: + version: "3.9" + should-setup-pip-paths: 'false' + self-hosted-runner: 'false' + + - name: Setup Python 3.10 + uses: ./openvino/.github/actions/setup_python + with: + version: "3.10" + should-setup-pip-paths: 'false' + self-hosted-runner: 'false' + + - name: Setup Python 3.12 + uses: ./openvino/.github/actions/setup_python + with: + version: "3.12" + should-setup-pip-paths: 'false' + self-hosted-runner: 'false' + + - name: Build additional Python wheels + run: | + for py_version in "3.9" "3.10" "3.11" "3.12" + do + python_exec_path=$(python$py_version -c "import sys; print(sys.executable)") + $python_exec_path -m pip install -r ${{ env.OPENVINO_REPO 
}}/src/bindings/python/wheel/requirements-dev.txt + + cmake -DPython3_EXECUTABLE=$python_exec_path -DENABLE_WHEEL=ON -DOpenVINODeveloperPackage_DIR=${{ env.BUILD_DIR }} -S ${{ env.OPENVINO_REPO }}/src/bindings/python -B ${{ github.workspace }}/py$py_version + cmake --build ${{ github.workspace }}/py$py_version --parallel + cmake --install ${{ github.workspace }}/py$py_version --config ${{ env.CMAKE_BUILD_TYPE }} --prefix ${{ env.INSTALL_WHEELS_DIR }} --component python_wheels + done + + # Setup Python 3.11 as the default one + - name: Setup Python ${{ env.PYTHON_VERSION }} + uses: ./openvino/.github/actions/setup_python + with: + version: ${{ env.PYTHON_VERSION }} + should-setup-pip-paths: 'false' + self-hosted-runner: 'false' + - name: Cmake & Build - OpenVINO Contrib run: | cmake \ @@ -279,6 +321,19 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.11' + Python_API_Tests: + name: Python API tests + needs: [ Build, Smart_CI ] + uses: ./.github/workflows/job_python_api_tests.yml + strategy: + fail-fast: false + matrix: + python-version: [ '3.9', '3.10', '3.11', '3.12' ] + with: + runner: 'macos-13-xlarge' + python-version: ${{ matrix.python-version }} + if: fromJSON(needs.smart_ci.outputs.affected_components).Python_API.test + TensorFlow_Layer_Tests: name: TensorFlow Layer Tests needs: [ Build, Smart_CI, Openvino_tokenizers ] diff --git a/.github/workflows/manylinux_2014.yml b/.github/workflows/manylinux_2014.yml index bd5da965226a50..aa0b06b6cf05bd 100644 --- a/.github/workflows/manylinux_2014.yml +++ b/.github/workflows/manylinux_2014.yml @@ -69,6 +69,11 @@ jobs: images: | ov_build/ubuntu_22_04_x64_docker ov_build/manylinux2014_x86_64 + ov_test/ubuntu_20_04_x64_py313 + ov_test/ubuntu_22_04_x64 + ov_test/ubuntu_24_04_x64 + ov_test/fedora_33 + ov_test/debian_10_py310 registry: 'openvinogithubactions.azurecr.io' dockerfiles_root_dir: '.github/dockerfiles' changed_components: ${{ 
needs.smart_ci.outputs.changed_components }} @@ -92,6 +97,7 @@ jobs: OPENVINO_REPO: ${{ github.workspace }}/src INSTALL_DIR: ${{ github.workspace }}/install/openvino INSTALL_WHEELS_DIR: ${{ github.workspace }}/install/wheels + INSTALL_TEST_DIR: ${{ github.workspace }}/install/tests BUILD_DIR: ${{ github.workspace }}/build DOCKER_CONFIG: "/mount/.docker" CMAKE_CXX_COMPILER_LAUNCHER: sccache @@ -135,6 +141,7 @@ jobs: -v ${{ env.OPENVINO_REPO }}:/work/src \ -v ov_build_cache:/work/build \ -v ${{ env.INSTALL_DIR }}:/work/install \ + -v ${{ env.INSTALL_TEST_DIR }}:/work/api_tests \ -e SCCACHE_AZURE_BLOB_CONTAINER \ -e SCCACHE_AZURE_CONNECTION_STRING \ -e SCCACHE_SERVER_PORT \ @@ -148,16 +155,18 @@ jobs: -w /work/src \ ${{ fromJSON(needs.docker.outputs.images).ov_build.manylinux2014_x86_64 }} \ /bin/bash -c " - cmake -DENABLE_CPPLINT=OFF -DENABLE_NCC_STYLE=OFF -DCMAKE_VERBOSE_MAKEFILE=ON -DENABLE_PYTHON=OFF -DENABLE_WHEEL=OFF -S /work/src -B /work/build && + python3.12 -m pip install -r /work/src/src/bindings/python/wheel/requirements-dev.txt + cmake -DPython3_EXECUTABLE=/usr/local/bin/python3.12 -DENABLE_CPPLINT=OFF -DENABLE_NCC_STYLE=OFF -DENABLE_TESTS=ON -DCMAKE_VERBOSE_MAKEFILE=ON -DENABLE_OV_TF_FRONTEND=OFF -DENABLE_OV_TF_LITE_FRONTEND=OFF -DENABLE_OV_PADDLE_FRONTEND=OFF -DENABLE_OV_PYTORCH_FRONTEND=ON -DENABLE_OV_JAX_FRONTEND=OFF -DENABLE_OV_ONNX_FRONTEND=ON -DENABLE_PYTHON=ON -DENABLE_WHEEL=ON -S /work/src -B /work/build && cmake --build /work/build --parallel $(nproc) --config ${{ env.CMAKE_BUILD_TYPE }} && cmake --install /work/build --config ${{ env.CMAKE_BUILD_TYPE }} --prefix /work/install + cmake --install /work/build --config ${{ env.CMAKE_BUILD_TYPE }} --prefix /work/api_tests --component tests " - name: Pack Artifacts run: mkdir -p ${{ env.BUILD_DIR }} && tar -cvf - * | pigz > ${{ env.BUILD_DIR }}/openvino_package.tar.gz working-directory: ${{ env.INSTALL_DIR }} - - name: Build Python API(Python 3.9-3.13) + - name: Build Python API (Python 3.9-3.13) run: 
| SUPPORTED_PYTHON_VERSIONS=("39" "310" "311" "312" "313") for PY_VER in "${SUPPORTED_PYTHON_VERSIONS[@]}"; do @@ -190,6 +199,10 @@ jobs: " done + - name: Pack openvino_tests + run: tar -cvf - * | pigz > ${{ env.BUILD_DIR }}/openvino_tests.tar.gz + working-directory: ${{ env.INSTALL_TEST_DIR }} + # # Upload build artifacts # @@ -208,7 +221,15 @@ jobs: name: openvino_wheels path: ${{ env.INSTALL_WHEELS_DIR }}/wheels/*.whl if-no-files-found: 'error' - + + - name: Upload openvino tests package + if: ${{ always() }} + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 + with: + name: openvino_tests + path: ${{ env.BUILD_DIR }}/openvino_tests.tar.gz + if-no-files-found: 'error' + - name: Store artifacts to a shared drive id: store_artifacts if: ${{ always() }} @@ -220,10 +241,34 @@ jobs: ${{ env.INSTALL_WHEELS_DIR }}/wheels storage_dir: ${{ env.PRODUCT_TYPE }} storage_root: ${{ env.ARTIFACTS_SHARE }} - + + Python_API_Tests: + name: Python API tests + needs: [ Docker, Build, Smart_CI ] + uses: ./.github/workflows/job_python_api_tests.yml + strategy: + fail-fast: false + matrix: + include: + - python-version: "3.9" + image: ${{ fromJSON(needs.docker.outputs.images).ov_test.fedora_33 }} + - python-version: "3.10" + image: ${{ fromJSON(needs.docker.outputs.images).ov_test.debian_10_py310 }} + - python-version: "3.11" + image: ${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_22_04_x64 }} + - python-version: "3.12" + image: ${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_24_04_x64 }} + - python-version: "3.13" + image: ${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_x64_py313 }} + with: + runner: 'aks-linux-4-cores-16gb' + container: '{"image": "${{ matrix.image }}", "volumes": ["/mount:/mount"]}' + python-version: ${{ matrix.python-version }} + if: fromJSON(needs.smart_ci.outputs.affected_components).Python_API.test + Overall_Status: name: ci/gha_overall_status_manylinux2014 - needs: [Smart_CI, Build] + needs: 
[Smart_CI, Build, Python_API_Tests] if: ${{ always() }} runs-on: ubuntu-latest steps: diff --git a/.github/workflows/ubuntu_22.yml b/.github/workflows/ubuntu_22.yml index f4caec8b2458a0..4fc93d73213f78 100644 --- a/.github/workflows/ubuntu_22.yml +++ b/.github/workflows/ubuntu_22.yml @@ -300,6 +300,16 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.11' + Python_API_Tests: + name: Python API tests + needs: [ Docker, Build, Smart_CI ] + uses: ./.github/workflows/job_python_api_tests.yml + with: + runner: 'aks-linux-4-cores-16gb' + container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_22_04_x64 }}", "volumes": ["/mount:/mount"]}' + python-version: '3.11' + if: fromJSON(needs.smart_ci.outputs.affected_components).Python_API.test + TensorFlow_Layer_Tests: name: TensorFlow Layer Tests needs: [ Docker, Build, Smart_CI, Openvino_tokenizers ] diff --git a/.github/workflows/ubuntu_24.yml b/.github/workflows/ubuntu_24.yml index d874e06a189232..1ad3951ecd3347 100644 --- a/.github/workflows/ubuntu_24.yml +++ b/.github/workflows/ubuntu_24.yml @@ -134,6 +134,16 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.12' + Python_API_Tests: + name: Python API tests + needs: [ Docker, Build, Smart_CI ] + uses: ./.github/workflows/job_python_api_tests.yml + with: + runner: 'aks-linux-4-cores-16gb' + container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_24_04_x64 }}", "volumes": ["/mount:/mount"]}' + python-version: '3.12' + if: fromJSON(needs.smart_ci.outputs.affected_components).Python_API.test + Pytorch_Layer_Tests: name: Pytorch Layer Tests needs: [ Docker, Build, Smart_CI ] diff --git a/.github/workflows/windows_conditional_compilation.yml b/.github/workflows/windows_conditional_compilation.yml index 6a5846b514dbd7..2c8ba236d8503c 100644 --- a/.github/workflows/windows_conditional_compilation.yml +++ 
b/.github/workflows/windows_conditional_compilation.yml @@ -237,7 +237,7 @@ jobs: Compress-Archive @compress $compress = @{ - Path = "${{ env.OPENVINO_REPO }}/bin/intel64/${{ env.CMAKE_BUILD_TYPE }}/ov_cpu_func_tests.exe", "${{ env.BUILD_DIR }}/bin/${{ env.CMAKE_BUILD_TYPE }}/ze_loader.dll", "${{ env.OPENVINO_REPO }}/bin/intel64/${{ env.CMAKE_BUILD_TYPE }}/openvino_template_extension.dll", "${{ env.OPENVINO_REPO }}/src/tests/test_utils/functional_test_utils/layer_tests_summary", "${{ env.INSTALL_DIR }}/runtime/3rdparty/tbb" + Path = "${{ env.OPENVINO_REPO }}/bin/intel64/${{ env.CMAKE_BUILD_TYPE }}/ov_cpu_func_tests.exe", "${{ env.OPENVINO_REPO }}/bin/intel64/${{ env.CMAKE_BUILD_TYPE }}/openvino_template_extension.dll", "${{ env.OPENVINO_REPO }}/src/tests/test_utils/functional_test_utils/layer_tests_summary", "${{ env.INSTALL_DIR }}/runtime/3rdparty/tbb" CompressionLevel = "Optimal" DestinationPath = "${{ env.BUILD_DIR }}/openvino_tests.zip" } diff --git a/.github/workflows/workflows_scans.yml b/.github/workflows/workflows_scans.yml new file mode 100644 index 00000000000000..0a293a4152b9a0 --- /dev/null +++ b/.github/workflows/workflows_scans.yml @@ -0,0 +1,38 @@ +name: GitHub Actions Workflows Scans +on: + workflow_dispatch: {} + push: + paths: + - '.github/workflows/**' + branches: + - 'master' + - 'releases/**' + pull_request: + paths: + - '.github/workflows/**' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +permissions: read-all + +jobs: + semgrep: + name: github_actions_workflows_scan/semgrep + runs-on: ubuntu-latest + if: ${{ github.repository_owner == 'openvinotoolkit' }} + + container: + image: semgrep/semgrep + + steps: + - name: Checkout + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + submodules: 'false' + sparse-checkout: .github/workflows + + - name: Semgrep scan + run: | + semgrep scan --error -j 8 --config "p/github-actions" .github/workflows/* diff --git 
a/.github/workflows/workflows_to_track.txt b/.github/workflows/workflows_to_track.txt new file mode 100644 index 00000000000000..ef3bb633ed7737 --- /dev/null +++ b/.github/workflows/workflows_to_track.txt @@ -0,0 +1,75 @@ +==> ./stale_prs_and_issues.yml <== +name: 'Close stale issues and PRs' +==> ./build_doc.yml <== +name: Documentation +==> ./ovc.yml <== +name: OVC +==> ./ubuntu_22.yml <== +name: Linux (Ubuntu 22.04, Python 3.11) +==> ./windows_conditional_compilation.yml <== +name: Windows Conditional Compilation (VS 2022, Python 3.11) +==> ./send_workflows_to_opentelemetry.yml <== +name: Export workflow metrics (BETA) +==> ./ubuntu_22_dpcpp.yml <== +name: Linux (Ubuntu 22.04, Python 3.11, Intel DPC++ Compiler) +==> ./coverage.yml <== +name: Code coverage +==> ./linux_conditional_compilation.yml <== +name: Linux Static CC (Ubuntu 22.04, Python 3.11, Clang) +==> ./workflows_scans.yml <== +name: GitHub Actions Workflows Scans +==> ./check_pr_commits.yml <== +name: PR Commits +==> ./windows_vs2019_debug.yml <== +name: Windows (VS 2019, Python 3.11, Debug) +==> ./files_size.yml <== +name: Files Size +==> ./cleanup_caches.yml <== +name: Cleanup caches +==> ./mac.yml <== +name: macOS (Python 3.11) +==> ./merge_queue_stub.yml <== +==> ./debian_10_arm.yml <== +name: Debian 10 ARM +==> ./android_arm64.yml <== +name: Android ARM64 with vcpkg +==> ./code_style.yml <== +name: Code Style +==> ./manylinux_2014.yml <== +name: Manylinux 2014 +==> ./linux_arm64.yml <== +name: Linux ARM64 (Ubuntu 20.04, Python 3.11) +==> ./dev_cpu_linux_snippets_libxsmm.yml <== +name: Linux CPU Plugin Snippets with LIBXSMM (Ubuntu 20.04) +==> ./labeler.yml <== +name: "Pull Request Labeler" +==> ./mac_arm64.yml <== +name: macOS ARM64 (Python 3.11) +==> ./dependency_review.yml <== +name: 'Dependency Review' +==> ./fedora_29.yml <== +name: Fedora 29 (RHEL 8.4), Python 3.9 +==> ./code_snippets.yml <== +name: Code snippets +==> ./ubuntu_20.yml <== +name: Linux (Ubuntu 20.04, Python 3.9) +==> 
./linux_riscv.yml <== +name: Linux RISC-V with Conan (Ubuntu 22.04, Python 3.10) +==> ./android_x64.yml <== +name: Android x64 +==> ./workflow_rerunner.yml <== +name: Rerun Workflow with Known Errors +==> ./linux_sanitizers.yml <== +name: Linux Sanitizers (Ubuntu 20.04, Python 3.9) +==> ./py_checks.yml <== +name: Python API Checks +==> ./webassembly.yml <== +name: Webassembly +==> ./ubuntu_24.yml <== +name: Linux (Ubuntu 24.04, Python 3.12) +==> ./assign_issue.yml <== +name: Take Issue +==> ./windows_vs2019_release.yml <== +name: Windows (VS 2019, Python 3.11, Release) +==> ./coverity.yml <== +name: Coverity (Ubuntu 20.04, Python 3.11) diff --git a/cmake/packaging/debian.cmake b/cmake/packaging/debian.cmake index 59b312963c180d..c82dca0364b463 100644 --- a/cmake/packaging/debian.cmake +++ b/cmake/packaging/debian.cmake @@ -99,6 +99,7 @@ macro(ov_cpack_settings) 2024.3.0 2024.4.0 2024.5.0 + 2024.6.0 ) ov_check_conflicts_versions(conflicting_versions) diff --git a/cmake/packaging/rpm.cmake b/cmake/packaging/rpm.cmake index a4a63c35858bf9..6e9d535d41cfff 100644 --- a/cmake/packaging/rpm.cmake +++ b/cmake/packaging/rpm.cmake @@ -87,6 +87,7 @@ macro(ov_cpack_settings) 2024.3.0 2024.4.0 2024.5.0 + 2024.6.0 ) ov_check_conflicts_versions(conflicting_versions) diff --git a/docs/RELEASE.MD b/docs/RELEASE.MD new file mode 100644 index 00000000000000..b345431f3f2bcf --- /dev/null +++ b/docs/RELEASE.MD @@ -0,0 +1,29 @@ +# OpenVINO Release Management +The process described below reflects the approach to managing OpenVINO releases. + +## Release Milestones +- Planning +- Execution (development of new features) +- Stabilization (Feature Freeze, Code Freeze milestones) +- Validation +- Distribution + +### Planning +This phase takes 2-4 weeks and involves scoping the backlog, prioritizing it, analyzing, and making commitments by developers for timelines specified by the release manager. 
+ +### Execution (development of new features) +- [OpenVINO Contributing Guide](https://github.com/openvinotoolkit/openvino/blob/master/CONTRIBUTING.md) +- [Code Contribution Guide](https://docs.openvino.ai/2024/about-openvino/contributing/code-contribution-guide.html) +- [OpenVINO First Good Issue](https://github.com/openvinotoolkit/openvino/issues/17502) + +### Stabilization (Feature Freeze, Code Freeze milestones) +- **Feature Freeze**: This milestone ensures that no new features are added to the software after a certain point. This allows the development team to focus on stabilizing and refining the existing features, fixing bugs, and improving performance without the risk of introducing new issues. +- **Code Freeze**: This milestone marks the point where no new code changes are allowed except for critical bug fixes. This helps in ensuring that the final product is stable and reliable, as it minimizes the risk of last-minute changes that could introduce new bugs or instability. + +### Release Validation +- This is a continuous process executed on a regular basis with cadence based on testing type: nightly, bi-weekly, weekly. +- After Code Freeze, the testing team can perform final regression testing to ensure that recent changes have not introduced new bugs and that the software meets the required quality standards. + +### Distribution +- OpenVINO has different types of build distribution: Regular releases, Long-Term Support, Pre-release releases, Nightly builds. Read more here: [OpenVINO Release Policy](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/release-policy.html) +- Different distribution channels are supported. 
Explore different options here: [OpenVINO Download](https://www.intel.com/content/www/us/en/developer/tools/openvino-toolkit/download.html) diff --git a/docs/articles_en/about-openvino/performance-benchmarks.rst b/docs/articles_en/about-openvino/performance-benchmarks.rst index 78a364c18ca4e6..a398432925a983 100644 --- a/docs/articles_en/about-openvino/performance-benchmarks.rst +++ b/docs/articles_en/about-openvino/performance-benchmarks.rst @@ -56,7 +56,8 @@ implemented in your solutions. Click the buttons below to see the chosen benchma :material-regular:`table_view;1.4em` LLM performance for AI PC - .. grid-item:: +.. uncomment under + .. .. grid-item:: .. button-link:: # :class: ovms-toolkit-benchmark-llm-result @@ -163,7 +164,7 @@ For a listing of all platforms and configurations used for testing, refer to the 2024.5, as of November 20, 2024. * OpenVINO Model Server performance results are based on release - 2024.4, as of Sept. 30, 2024. + 2024.5, as of November 20, 2024. The results may not reflect all publicly available updates. Intel technologies' features and benefits depend on system configuration and may require enabled hardware, software, or service diff --git a/docs/articles_en/about-openvino/performance-benchmarks/generative-ai-performance.rst b/docs/articles_en/about-openvino/performance-benchmarks/generative-ai-performance.rst index 5697fcbf6e4d74..cc51b1f5fac969 100644 --- a/docs/articles_en/about-openvino/performance-benchmarks/generative-ai-performance.rst +++ b/docs/articles_en/about-openvino/performance-benchmarks/generative-ai-performance.rst @@ -3,7 +3,7 @@ Most Efficient Large Language Models for AI PC This page is regularly updated to help you identify the best-performing LLMs on the Intel® Core™ Ultra processor family and AI PCs. -The current data is as of OpenVINO 2024.4, 24 Oct. 2024 +The current data is as of OpenVINO 2024.4, 20 Nov. 2024. The tables below list the key performance indicators for inference on built-in GPUs. 
diff --git a/docs/articles_en/assets/images/genai_main_diagram.svg b/docs/articles_en/assets/images/genai_main_diagram.svg new file mode 100644 index 00000000000000..b01cbd827acb3c --- /dev/null +++ b/docs/articles_en/assets/images/genai_main_diagram.svg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07ce964e115f1e3942cdf381f44b4dc6d466df62c70396306a4f241fb07ea3ed +size 392244 diff --git a/docs/articles_en/documentation/openvino-ecosystem.rst b/docs/articles_en/documentation/openvino-ecosystem.rst index 6735192e95f674..fe4f203428a865 100644 --- a/docs/articles_en/documentation/openvino-ecosystem.rst +++ b/docs/articles_en/documentation/openvino-ecosystem.rst @@ -12,6 +12,7 @@ OpenVINO™ Ecosystem Overview :hidden: openvino-ecosystem/openvino-training-extensions + openvino-ecosystem/openvino-test-drive openvino-ecosystem/datumaro openvino-ecosystem/openvino-security-add-on @@ -102,6 +103,14 @@ development process, empowering teams to produce custom AI models at scale. |hr| +| **OpenVINO™ Test Drive** +| :bdg-link-dark:`Github ` + +OpenVINO™ Test Drive is a cross-platform graphic user interface application that enables running +generative AI and vision models directly on your computer or edge device using OpenVINO™ Runtime. +|hr| + + | **Tokenizers** | :bdg-link-dark:`Github ` :bdg-link-success:`User Guide ` diff --git a/docs/articles_en/documentation/openvino-ecosystem/openvino-test-drive.rst b/docs/articles_en/documentation/openvino-ecosystem/openvino-test-drive.rst new file mode 100644 index 00000000000000..527a01bf38a6cf --- /dev/null +++ b/docs/articles_en/documentation/openvino-ecosystem/openvino-test-drive.rst @@ -0,0 +1,109 @@ +=============================================================================================== +OpenVINO™ Test Drive +=============================================================================================== + + +.. 
meta:: + :description: See how to test your models with OpenVINO, using a simple graphic interface of + Test Drive. + + + +OpenVINO™ Test Drive is a cross-platform graphic user interface application for running and +testing AI models, both generative and vision based. +It can run directly on your computer or on edge devices using +`OpenVINO™ Runtime `__. + +OpenVINO™ Test Drive is developed under the `openvino_testdrive repository `__. + +Use OpenVINO™ Test Drive to: + +* **Chat with LLMs** and evaluate model performance on your computer or edge device; +* **Experiment with different text prompts** to generate images, using Stable + Diffusion and Stable DiffusionXL models (coming soon); +* **Transcribe speech from video**, using Whisper models, including generation + of timestamps (coming soon); +* **Run inference of models** trained by Intel® Geti™ and **visualize the results**. + + + +Installation (Windows) +############################################################################################### + +1. Download the latest archive from the + `release repository `__. + To verify the integrity of the downloaded package, use the SHA-256 file attached. + +2. Extract the zip file and run the *MSIX* installation package. Click the `Install` button to + proceed. + +3. Launch OpenVINO™ Test Drive, clicking the application name in the Windows app list. + + +Quick start +############################################################################################### + +When starting the application, you can import an LLM model from Hugging Face Hub +or upload an Intel® Geti™ model from a local drive. + +Inference of models from Hugging Face ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +1. Find a model on `Hugging Face `__ and import it. + +2. Chat with LLMs via the `Playground` tab. + +3. Use the `Performance metrics` tab to get model performance metrics on your + computer or an edge device. 
+ + + +Inference of models trained with Intel® Geti™ ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +1. Download the deployment code for a model in the OpenVINO IR format trained + by Intel® Geti™ (refer to the `Intel® Geti™ documentation `__ + for more details). + +2. Import the deployment code into OpenVINO™ Test Drive, using the *Import model* and then + *Local disk* buttons. + +3. Use the *Live inference* tab to run and visualize results of inference of individual images. + +4. For batch inference, use the *Batch inference* tab and provide paths to the folder + with input images, as well as one for batch inference results. You can do so by filling out + the *Source folder* and *Destination folder* fields. Click *Start* to start batch inference. + + +Build the Application +############################################################################################### + +1. Make sure you `Install flutter SDK `__ + and all its platform-specific dependencies. +2. Build the bindings and place them in the **./bindings** folder. + + OpenVINO™ Test Drive uses bindings to `OpenVINO™ GenAI `__ + and `OpenVINO™ Model API `__, + which are located in the **./openvino_bindings** folder. Refer to the + `GitHub page `__ + for more details. + +3. Start the application, using the following command: + + .. code-block:: console + + flutter run + +Additional Resources +############################################################################################### + +- `OpenVINO™ `__ - a software toolkit + for optimizing and deploying deep learning models. +- `GenAI Repository `__ and + `OpenVINO Tokenizers `__ + - resources and tools for developing and optimizing Generative AI applications. +- `Intel® Geti™ `__ - software for building computer + vision models. +- `OpenVINO™ Model API `__ + - a set of wrapper classes for particular tasks and model architectures. 
+ It simplifies routine procedures, preprocessing and postprocessing of data. diff --git a/docs/articles_en/documentation/openvino-security.rst b/docs/articles_en/documentation/openvino-security.rst index 99cf13161bf243..255dbbd2b62c35 100644 --- a/docs/articles_en/documentation/openvino-security.rst +++ b/docs/articles_en/documentation/openvino-security.rst @@ -55,7 +55,8 @@ Hardware-based protection such as Intel Software Guard Extensions (Intel SGX) ca decryption operation secrets and bind them to a device. For more information, see the `Intel Software Guard Extensions `__. -Use the ``ov::Core::read_model`` to set model representations and weights respectively. +Use the `ov::Core::read_model <../api/c_cpp_api/group__ov__dev__exec__model.html#classov_1_1_core_1ae0576a95f841c3a6f5e46e4802716981>`__ +to set model representations and weights respectively. Currently there is no way to read external weights from memory for ONNX models. The ``ov::Core::read_model(const std::string& model, const Tensor& weights)`` method @@ -65,6 +66,20 @@ should be called with ``weights`` passed as an empty ``ov::Tensor``. :language: cpp :fragment: part1 + +Encrypted models that have already been compiled, in the form of blob files, +can be loaded using the +`ov::Core::import_model <../api/c_cpp_api/group__ov__runtime__cpp__api.html#_CPPv4N2ov4Core12import_modelERNSt7istreamERKNSt6stringERK6AnyMap>`__ +method, as shown in the code sample below: + +.. code-block:: cpp + + ov::Core core; + // Import a model from a blob. 
+ std::ifstream compiled_blob(blob, std::ios_base::in | std::ios_base::binary); + auto compiled_model = core.import_model(compiled_blob, "CPU"); + + Additional Resources #################### diff --git a/docs/articles_en/get-started/configurations/configurations-intel-gpu.rst b/docs/articles_en/get-started/configurations/configurations-intel-gpu.rst index e10a67fddadb53..4d1eb37007f59d 100644 --- a/docs/articles_en/get-started/configurations/configurations-intel-gpu.rst +++ b/docs/articles_en/get-started/configurations/configurations-intel-gpu.rst @@ -44,6 +44,7 @@ Below are the instructions on how to install the OpenCL packages on supported Li .. code-block:: sh apt-get install -y ocl-icd-libopencl1 intel-opencl-icd intel-level-zero-gpu level-zero + sudo usermod -a -G render $LOGNAME .. tab-item:: Ubuntu 20.04 LTS :sync: ubuntu-20 @@ -57,6 +58,7 @@ Below are the instructions on how to install the OpenCL packages on supported Li echo 'deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu focal-legacy main' | tee /etc/apt/sources.list.d/intel.gpu.focal.list && \ apt-get update apt-get update && apt-get install -y --no-install-recommends intel-opencl-icd intel-level-zero-gpu level-zero + sudo usermod -a -G render $LOGNAME Alternatively, download older `deb` version from `here `__. Note that older driver version might not include some of the bug fixes and might be not supported on some latest platforms. Check the supported hardware for the versions you are installing. 
@@ -135,6 +137,6 @@ Additional Resources * `Docker CI framework for Intel® Distribution of OpenVINO™ toolkit `__ * `Get Started with DockerHub CI for Intel® Distribution of OpenVINO™ toolkit `__ * `Dockerfiles with Intel® Distribution of OpenVINO™ toolkit `__ - +* `GPU Driver issue troubleshoot ` diff --git a/docs/articles_en/learn-openvino.rst b/docs/articles_en/learn-openvino.rst index 4fca64051003a7..98797c9c67c126 100644 --- a/docs/articles_en/learn-openvino.rst +++ b/docs/articles_en/learn-openvino.rst @@ -14,7 +14,7 @@ Learn OpenVINO Interactive Tutorials (Python) Sample Applications (Python & C++) - Large Language Model Inference Guide + Generative AI workflow @@ -29,5 +29,5 @@ as well as an experienced user. | :doc:`OpenVINO Samples ` | The OpenVINO samples (Python and C++) are simple console applications that show how to use specific OpenVINO API features. They can assist you in executing tasks such as loading a model, running inference, querying particular device capabilities, etc. -| :doc:`Large Language Models in OpenVINO ` +| :doc:`Generative AI workflow ` | Detailed information on how OpenVINO accelerates Generative AI use cases and what models it supports. This tutorial provides instructions for running Generative AI models using Hugging Face Optimum Intel and Native OpenVINO APIs. diff --git a/docs/articles_en/learn-openvino/llm_inference_guide.rst b/docs/articles_en/learn-openvino/llm_inference_guide.rst index 36c001c015f744..5846d1a484737c 100644 --- a/docs/articles_en/learn-openvino/llm_inference_guide.rst +++ b/docs/articles_en/learn-openvino/llm_inference_guide.rst @@ -1,140 +1,107 @@ -Large Language Model Inference Guide +Generative AI workflow ======================================== .. meta:: - :description: Explore learning materials, including interactive - Python tutorials and sample console applications that explain - how to use OpenVINO features. + :description: learn how to use OpenVINO to run generative AI models. .. 
toctree:: :maxdepth: 1 :hidden: - Run LLMs with Optimum Intel - Run LLMs on OpenVINO GenAI Flavor - Run LLMs on Base OpenVINO + Generative Model Preparation + Inference with OpenVINO GenAI + Inference with Optimum Intel + Generative AI with Base OpenVINO (not recommended) OpenVINO Tokenizers -Large Language Models (LLMs) like GPT are transformative deep learning networks capable of a -broad range of natural language tasks, from text generation to language translation. OpenVINO -optimizes the deployment of these models, enhancing their performance and integration into -various applications. This guide shows how to use LLMs with OpenVINO, from model loading and -conversion to advanced use cases. + + +Generative AI is a specific area of Deep Learning models used for producing new and “original” +data, based on input in the form of image, sound, or natural language text. Due to their +complexity and size, generative AI pipelines are more difficult to deploy and run efficiently. +OpenVINO simplifies the process and ensures high-performance integrations, with the following +options: + +.. tab-set:: + + .. tab-item:: OpenVINO GenAI + + | - Suggested for production deployment for the supported use cases. + | - Smaller footprint and fewer dependencies. + | - More optimization and customization options. + | - Available in both Python and C++. + | - A limited set of supported use cases. + + :doc:`Install the OpenVINO GenAI package <../get-started/install-openvino/install-openvino-genai>` + and run generative models out of the box. With custom + API and tokenizers, among other components, it manages the essential tasks such as the + text generation loop, tokenization, and scheduling, offering ease of use and high + performance. + + .. tab-item:: Hugging Face integration + + | - Suggested for prototyping and, if the use case is not covered by OpenVINO GenAI, production. + | - Bigger footprint and more dependencies. + | - Limited customization due to Hugging Face dependency. 
+ | - Not usable for C++ applications. + | - A very wide range of supported models. + + Using Optimum Intel is a great way to experiment with different models and scenarios, + thanks to a simple interface for the popular API and infrastructure offered by Hugging Face. + It also enables weight compression with + `Neural Network Compression Framework (NNCF) `__, + as well as conversion on the fly. For integration with the final product it may offer + lower performance, though. + +`Check out the GenAI Quick-start Guide [PDF] `__ The advantages of using OpenVINO for LLM deployment: -* **OpenVINO offers optimized LLM inference**: - provides a full C/C++ API, leading to faster operation than Python-based runtimes; includes a - Python API for rapid development, with the option for further optimization in C++. -* **Compatible with diverse hardware**: - supports CPUs, GPUs, and neural accelerators across ARM and x86/x64 architectures, integrated - Intel® Processor Graphics, discrete Intel® Arc™ A-Series Graphics, and discrete Intel® Data - Center GPU Flex Series; features automated optimization to maximize performance on target - hardware. -* **Requires fewer dependencies**: - than frameworks like Hugging Face and PyTorch, resulting in a smaller binary size and reduced - memory footprint, making deployments easier and updates more manageable. -* **Provides compression and precision management techniques**: - such as 8-bit and 4-bit weight compression, including embedding layers, and storage format - reduction. This includes fp16 precision for non-compressed models and int8/int4 for compressed - models, like GPTQ models from `Hugging Face `__. -* **Supports a wide range of deep learning models and architectures**: - including text, image, and audio generative models like Llama 2, MPT, OPT, Stable Diffusion, - Stable Diffusion XL. This enables the development of multimodal applications, allowing for - write-once, deploy-anywhere capabilities. 
-* **Enhances inference capabilities**: - fused inference primitives such as Scaled Dot Product Attention, Rotary Positional Embedding, - Group Query Attention, and Mixture of Experts. It also offers advanced features like in-place - KV-cache, dynamic quantization, KV-cache quantization and encapsulation, dynamic beam size - configuration, and speculative sampling. -* **Provides stateful model optimization**: - models from the Hugging Face Transformers are converted into a stateful form, optimizing - inference performance and memory usage in long-running text generation tasks by managing past - KV-cache tensors more efficiently internally. This feature is automatically activated for many - supported models, while unsupported ones remain stateless. Learn more about the - :doc:`Stateful models and State API <../openvino-workflow/running-inference/stateful-models>`. - -OpenVINO offers three main paths for Generative AI use cases: - -* **Hugging Face**: use OpenVINO as a backend for Hugging Face frameworks (transformers, - diffusers) through the `Optimum Intel `__ - extension. -* **OpenVINO GenAI Flavor**: use OpenVINO GenAI APIs (Python and C++). -* **Base OpenVINO**: use OpenVINO native APIs (Python and C++) with - `custom pipeline code `__. - -In both cases, the OpenVINO runtime is used for inference, and OpenVINO tools are used for -optimization. The main differences are in footprint size, ease of use, and customizability. - -The Hugging Face API is easy to learn, provides a simple interface and hides the complexity of -model initialization and text generation for a better developer experience. However, it has more -dependencies, less customization, and cannot be ported to C/C++. - -The OpenVINO GenAI Flavor reduces the complexity of LLMs implementation by -automatically managing essential tasks like the text generation loop, tokenization, -and scheduling. The Native OpenVINO API provides a more hands-on experience, -requiring manual setup of these functions. 
Both methods are designed to minimize dependencies -and the overall application footprint and enable the use of generative models in C++ applications. - -It is recommended to start with Hugging Face frameworks to experiment with different models and -scenarios. Then the model can be used with OpenVINO APIs if it needs to be optimized -further. Optimum Intel provides interfaces that enable model optimization (weight compression) -using `Neural Network Compression Framework (NNCF) `__, -and export models to the OpenVINO model format for use in native API applications. - -Proceed to run LLMs with: +.. dropdown:: Fewer dependencies and smaller footprint + :animate: fade-in-slide-down + :color: secondary + + Less bloated than frameworks such as Hugging Face and PyTorch, with a smaller binary size and reduced + memory footprint, makes deployments easier and updates more manageable. + +.. dropdown:: Compression and precision management + :animate: fade-in-slide-down + :color: secondary + + Techniques such as 8-bit and 4-bit weight compression, including embedding layers, and storage + format reduction. This includes fp16 precision for non-compressed models and int8/int4 for + compressed models, like GPTQ models from `Hugging Face `__. + +.. dropdown:: Enhanced inference capabilities + :animate: fade-in-slide-down + :color: secondary + + Advanced features like in-place KV-cache, dynamic quantization, KV-cache quantization and + encapsulation, dynamic beam size configuration, and speculative sampling, and more are + available. + +.. dropdown:: Stateful model optimization + :animate: fade-in-slide-down + :color: secondary + + Models from the Hugging Face Transformers are converted into a stateful form, optimizing + inference performance and memory usage in long-running text generation tasks by managing past + KV-cache tensors more efficiently internally. This feature is automatically activated for + many supported models, while unsupported ones remain stateless. 
Learn more about the + :doc:`Stateful models and State API <../openvino-workflow/running-inference/stateful-models>`. + +.. dropdown:: Optimized LLM inference + :animate: fade-in-slide-down + :color: secondary + + Includes a Python API for rapid development and C++ for further optimization, offering + better performance than Python-based runtimes. + + +Proceed to guides on: -* :doc:`Hugging Face and Optimum Intel <./llm_inference_guide/llm-inference-hf>` * :doc:`OpenVINO GenAI Flavor <./llm_inference_guide/genai-guide>` -* :doc:`Native OpenVINO API <./llm_inference_guide/llm-inference-native-ov>` - -The table below summarizes the differences between Hugging Face and the native OpenVINO API -approaches. - -.. dropdown:: Differences between Hugging Face and the native OpenVINO API - - .. list-table:: - :widths: 20 25 55 - :header-rows: 1 - - * - - - Hugging Face through OpenVINO - - OpenVINO Native API - * - Model support - - Supports transformer-based models such as LLMs - - Supports all model architectures from most frameworks - * - APIs - - Python (Hugging Face API) - - Python, C++ (OpenVINO API) - * - Model Format - - Source Framework / OpenVINO - - Source Framework / OpenVINO - * - Inference code - - Hugging Face based - - Custom inference pipelines - * - Additional dependencies - - Many Hugging Face dependencies - - Lightweight (e.g. numpy, etc.) 
- * - Application footprint - - Large - - Small - * - Pre/post-processing and glue code - - Provided through high-level Hugging Face APIs - - Must be custom implemented (see OpenVINO samples and notebooks) - * - Performance - - Good, but less efficient compared to native APIs - - Inherent speed advantage with C++, but requires hands-on optimization - * - Flexibility - - Constrained to Hugging Face API - - High flexibility with Python and C++; allows custom coding - * - Learning Curve and Effort - - Lower learning curve; quick to integrate - - Higher learning curve; requires more effort in integration - * - Ideal Use Case - - Ideal for quick prototyping and Python-centric projects - - Best suited for high-performance, resource-optimized production environments - * - Model Serving - - Paid service, based on CPU/GPU usage with Hugging Face - - Free code solution, run script for own server; costs may incur for cloud services - like AWS but generally cheaper than Hugging Face rates +* :doc:`Hugging Face and Optimum Intel <./llm_inference_guide/llm-inference-hf>` + + diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide-npu.rst b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide-npu.rst index 41e5cbb5733c58..d725b306d57908 100644 --- a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide-npu.rst +++ b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide-npu.rst @@ -1,4 +1,4 @@ -Run LLMs with OpenVINO GenAI Flavor on NPU +Inference with OpenVINO GenAI ========================================== .. 
meta:: diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst index f18b66915fc3ce..42c1c3fb47aa42 100644 --- a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst +++ b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst @@ -1,4 +1,4 @@ -Run LLM Inference on OpenVINO with the GenAI Flavor +Inference with OpenVINO GenAI =============================================================================================== .. meta:: @@ -9,39 +9,332 @@ Run LLM Inference on OpenVINO with the GenAI Flavor :hidden: NPU inference of LLMs - genai-guide/genai-use-cases -This guide will show you how to integrate the OpenVINO GenAI flavor into your application, covering -loading a model and passing the input context to receive generated text. Note that the vanilla flavor of OpenVINO -will not work with these instructions, make sure to -:doc:`install OpenVINO GenAI <../../get-started/install-openvino/install-openvino-genai>`. +OpenVINO™ GenAI is a library of pipelines and methods, extending the OpenVINO runtime to work +with generative AI models more efficiently. This article provides reference code and guidance +on its usage. Note that the base OpenVINO version will not work with these instructions, +make sure to :doc:`install OpenVINO with GenAI <../../get-started/install-openvino/install-openvino-genai>`. -.. note:: +.. image:: ../../assets/images/genai_main_diagram.svg + :align: center + :alt: OpenVINO GenAI workflow diagram - The examples use the CPU as the target device, however, the GPU is also supported. - Note that for the LLM pipeline, the GPU is used only for inference, while token selection, tokenization, and - detokenization remain on the CPU, for efficiency. Tokenizers are represented as a separate model and also run - on the CPU. -1. Export an LLM model via Hugging Face Optimum-Intel. 
A chat-tuned TinyLlama model is used in this example: +| Here is sample code for several Generative AI use case scenarios. Note that these are very basic + examples and may need adjustments for your specific needs, like changing the inference device. +| For a more extensive instruction and additional options, see the + `step-by-step chat-bot guide <#chat-bot-use-case-step-by-step>`__ below. - .. code-block:: python +.. dropdown:: Text-to-Image Generation - optimum-cli export openvino --model "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --weight-format fp16 --trust-remote-code "TinyLlama-1.1B-Chat-v1.0" + .. tab-set:: + + .. tab-item:: Python + :sync: python + + .. tab-set:: + + .. tab-item:: main.py + :name: mainpy + + .. code-block:: python + + import openvino_genai + from PIL import Image + import numpy as np + + class Generator(openvino_genai.Generator): + def __init__(self, seed, mu=0.0, sigma=1.0): + openvino_genai.Generator.__init__(self) + np.random.seed(seed) + self.mu = mu + self.sigma = sigma + + def next(self): + return np.random.normal(self.mu, self.sigma) + + + def infer(model_dir: str, prompt: str): + device = 'CPU' # GPU can be used as well + random_generator = Generator(42) + pipe = openvino_genai.Text2ImagePipeline(model_dir, device) + image_tensor = pipe.generate( + prompt, + width=512, + height=512, + num_inference_steps=20, + num_images_per_prompt=1, + random_generator=random_generator + ) + + image = Image.fromarray(image_tensor.data[0]) + image.save("image.bmp") + + .. tab-item:: LoRA.py + :name: lorapy + + .. 
code-block:: python + + import openvino as ov + import openvino_genai + import numpy as np + import sys + + + class Generator(openvino_genai.Generator): + def __init__(self, seed, mu=0.0, sigma=1.0): + openvino_genai.Generator.__init__(self) + np.random.seed(seed) + self.mu = mu + self.sigma = sigma + + def next(self): + return np.random.normal(self.mu, self.sigma) + + + def image_write(path: str, image_tensor: ov.Tensor): + from PIL import Image + image = Image.fromarray(image_tensor.data[0]) + image.save(path) + + + def infer(models_path: str, prompt: str): + prompt = "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" + + device = "CPU" # GPU, NPU can be used as well + adapter_config = openvino_genai.AdapterConfig() + + for i in range(int(len(adapters) / 2)): + adapter = openvino_genai.Adapter(adapters[2 * i]) + alpha = float(adapters[2 * i + 1]) + adapter_config.add(adapter, alpha) + + pipe = openvino_genai.Text2ImagePipeline(models_path, device, adapters=adapter_config) + print("Generating image with LoRA adapters applied, resulting image will be in lora.bmp") + image = pipe.generate(prompt, + random_generator=Generator(42), + width=512, + height=896, + num_inference_steps=20) + + image_write("lora.bmp", image) + print("Generating image without LoRA adapters applied, resulting image will be in baseline.bmp") + image = pipe.generate(prompt, + adapters=openvino_genai.AdapterConfig(), + random_generator=Generator(42), + width=512, + height=896, + num_inference_steps=20 + ) + image_write("baseline.bmp", image) + + For more information, refer to the + `Python sample `__ + + .. tab-item:: C++ + :sync: cpp + + .. tab-set:: + + .. tab-item:: main.cpp + :name: maincpp + + .. 
code-block:: cpp + + #include "openvino/genai/text2image/pipeline.hpp" + + #include "imwrite.hpp" + + int32_t main(int32_t argc, char* argv[]) try { + OPENVINO_ASSERT(argc == 3, "Usage: ", argv[0], " ''"); + + const std::string models_path = argv[1], prompt = argv[2]; + const std::string device = "CPU"; // GPU, NPU can be used as well + + ov::genai::Text2ImagePipeline pipe(models_path, device); + ov::Tensor image = pipe.generate(prompt, + ov::genai::width(512), + ov::genai::height(512), + ov::genai::num_inference_steps(20), + ov::genai::num_images_per_prompt(1)); + + imwrite("image_%d.bmp", image, true); + + return EXIT_SUCCESS; + } catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; + } catch (...) { + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; + } + + .. tab-item:: LoRA.cpp + :name: loracpp + + .. code-block:: cpp + + #include "openvino/genai/text2image/pipeline.hpp" + + #include "imwrite.hpp" + + int32_t main(int32_t argc, char* argv[]) try { + OPENVINO_ASSERT(argc >= 3 && (argc - 3) % 2 == 0, "Usage: ", argv[0], " '' [ ...]]"); + + const std::string models_path = argv[1], prompt = argv[2]; + const std::string device = "CPU"; // GPU, NPU can be used as well + + ov::genai::AdapterConfig adapter_config; + for(size_t i = 0; i < (argc - 3)/2; ++i) { + ov::genai::Adapter adapter(argv[3 + 2*i]); + float alpha = std::atof(argv[3 + 2*i + 1]); + adapter_config.add(adapter, alpha); + } - *Optional*. Optimize the model: + ov::genai::Text2ImagePipeline pipe(models_path, device, ov::genai::adapters(adapter_config)); - The model is an optimized OpenVINO IR with FP16 precision. 
For enhanced LLM performance, - it is recommended to use lower precision for model weights, such as INT4, and to compress weights - using NNCF during model export directly: + std::cout << "Generating image with LoRA adapters applied, resulting image will be in lora.bmp\n"; + ov::Tensor image = pipe.generate(prompt, + ov::genai::random_generator(std::make_shared(42)), + ov::genai::width(512), + ov::genai::height(896), + ov::genai::num_inference_steps(20)); + imwrite("lora.bmp", image, true); - .. code-block:: python + std::cout << "Generating image without LoRA adapters applied, resulting image will be in baseline.bmp\n"; + image = pipe.generate(prompt, + ov::genai::adapters(), + ov::genai::random_generator(std::make_shared(42)), + ov::genai::width(512), + ov::genai::height(896), + ov::genai::num_inference_steps(20)); + imwrite("baseline.bmp", image, true); - optimum-cli export openvino --model "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --weight-format int4 --trust-remote-code "TinyLlama-1.1B-Chat-v1.0" + return EXIT_SUCCESS; + } catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; + } catch (...) { + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; + } + For more information, refer to the + `C++ sample `__ -2. Perform generation using the new GenAI API: + +.. dropdown:: Speech Recognition + + The application performs inference on speech recognition Whisper Models. The samples include + the ``WhisperPipeline`` class and use audio files in WAV format at a sampling rate of 16 kHz + as input. + + .. tab-set:: + + .. tab-item:: Python + :sync: cpp + + .. 
code-block:: python + + import openvino_genai + import librosa + + + def read_wav(filepath): + raw_speech, samplerate = librosa.load(filepath, sr=16000) + return raw_speech.tolist() + + + def infer(model_dir: str, wav_file_path: str): + device = "CPU" # GPU or NPU can be used as well. + pipe = openvino_genai.WhisperPipeline(model_dir, device) + + # The pipeline expects normalized audio with a sampling rate of 16kHz. + raw_speech = read_wav(wav_file_path) + result = pipe.generate( + raw_speech, + max_new_tokens=100, + language="<|en|>", + task="transcribe", + return_timestamps=True, + ) + + print(result) + + for chunk in result.chunks: + print(f"timestamps: [{chunk.start_ts}, {chunk.end_ts}] text: {chunk.text}") + + + For more information, refer to the + `Python sample `__. + + .. tab-item:: C++ + :sync: cpp + + .. code-block:: cpp + + #include "audio_utils.hpp" + #include "openvino/genai/whisper_pipeline.hpp" + + int main(int argc, char* argv[]) try { + if (3 > argc) { + throw std::runtime_error(std::string{"Usage: "} + argv[0] + " \"\""); + } + + std::filesystem::path models_path = argv[1]; + std::string wav_file_path = argv[2]; + std::string device = "CPU"; // GPU or NPU can be used as well. + + ov::genai::WhisperPipeline pipeline(models_path, device); + + ov::genai::WhisperGenerationConfig config(models_path / "generation_config.json"); + config.max_new_tokens = 100; + config.language = "<|en|>"; + config.task = "transcribe"; + config.return_timestamps = true; + + // The pipeline expects normalized audio with a sampling rate of 16kHz. 
+ ov::genai::RawSpeechInput raw_speech = utils::audio::read_wav(wav_file_path); + auto result = pipeline.generate(raw_speech, config); + + std::cout << result << "\n"; + + for (auto& chunk : *result.chunks) { + std::cout << "timestamps: [" << chunk.start_ts << ", " << chunk.end_ts << "] text: " << chunk.text << "\n"; + } + + } catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) { + } + return EXIT_FAILURE; + } catch (...) { + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) { + } + return EXIT_FAILURE; + } + + For more information, refer to the + `C++ sample `__. + + +.. dropdown:: Using GenAI in Chat Scenario + + For chat scenarios where inputs and outputs represent a conversation, maintaining KVCache + across inputs may prove beneficial. The ``start_chat`` and ``finish_chat`` chat-specific + methods are used to mark a conversation session, as shown in the samples below: .. tab-set:: @@ -50,9 +343,35 @@ will not work with these instructions, make sure to .. code-block:: python - import openvino_genai as ov_genai - pipe = ov_genai.LLMPipeline(model_path, "CPU") - print(pipe.generate("The Sun is yellow because", max_new_tokens=100)) + import openvino_genai + + + def streamer(subword): + print(subword, end='', flush=True) + return False + + + def infer(model_dir: str): + device = 'CPU' # GPU can be used as well. + pipe = openvino_genai.LLMPipeline(model_dir, device) + + config = openvino_genai.GenerationConfig() + config.max_new_tokens = 100 + + pipe.start_chat() + while True: + try: + prompt = input('question:\n') + except EOFError: + break + pipe.generate(prompt, config, streamer) + print('\n----------') + pipe.finish_chat() + + + + For more information, refer to the + `Python sample `__. .. tab-item:: C++ :sync: cpp @@ -60,27 +379,250 @@ will not work with these instructions, make sure to .. 
code-block:: cpp #include "openvino/genai/llm_pipeline.hpp" - #include - int main(int argc, char* argv[]) { - std::string model_path = argv[1]; - ov::genai::LLMPipeline pipe(model_path, "CPU"); - std::cout << pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(100)); + int main(int argc, char* argv[]) try { + if (2 != argc) { + throw std::runtime_error(std::string{"Usage: "} + argv[0] + " "); + } + std::string prompt; + std::string models_path = argv[1]; + + std::string device = "CPU"; // GPU, NPU can be used as well + ov::genai::LLMPipeline pipe(models_path, device); + + ov::genai::GenerationConfig config; + config.max_new_tokens = 100; + std::function streamer = [](std::string word) { + std::cout << word << std::flush; + return false; + }; + + pipe.start_chat(); + std::cout << "question:\n"; + while (std::getline(std::cin, prompt)) { + pipe.generate(prompt, config, streamer); + std::cout << "\n----------\n" + "question:\n"; + } + pipe.finish_chat(); + } catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; + } catch (...) { + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; } -The `LLMPipeline` is the main object used for decoding. You can construct it directly from the -folder with the converted model. It will automatically load the main model, tokenizer, detokenizer, -and the default generation configuration. -Once the model is exported from Hugging Face Optimum-Intel, it already contains all the information -necessary for execution, including the tokenizer/detokenizer and the generation config, ensuring that -its results match those generated by Hugging Face. + For more information, refer to the + `C++ sample `__ + + +.. 
dropdown:: Using GenAI with Vision Language Models + + OpenVINO GenAI introduces the ``openvino_genai.VLMPipeline`` pipeline for + inference of multimodal text-generation Vision Language Models (VLMs). + With a text prompt and an image as input, VLMPipeline can generate text using + models such as LLava or MiniCPM-V. See the chat scenario presented + in the samples below: + + .. tab-set:: + + .. tab-item:: Python + :sync: py + + .. code-block:: python + + import numpy as np + import openvino_genai + from PIL import Image + from openvino import Tensor + from pathlib import Path + + + def streamer(subword: str) -> bool: + print(subword, end='', flush=True) + + + def read_image(path: str) -> Tensor: + pic = Image.open(path).convert("RGB") + image_data = np.array(pic.getdata()).reshape(1, pic.size[1], pic.size[0], 3).astype(np.uint8) + return Tensor(image_data) + + + def read_images(path: str) -> list[Tensor]: + entry = Path(path) + if entry.is_dir(): + return [read_image(str(file)) for file in sorted(entry.iterdir())] + return [read_image(path)] + + + def infer(model_dir: str, image_dir: str): + rgbs = read_images(image_dir) + device = 'CPU' # GPU can be used as well. + enable_compile_cache = dict() + if "GPU" == device: + enable_compile_cache["CACHE_DIR"] = "vlm_cache" + pipe = openvino_genai.VLMPipeline(model_dir, device, **enable_compile_cache) + + config = openvino_genai.GenerationConfig() + config.max_new_tokens = 100 + + pipe.start_chat() + prompt = input('question:\n') + pipe.generate(prompt, images=rgbs, generation_config=config, streamer=streamer) + + while True: + try: + prompt = input("\n----------\n" + "question:\n") + except EOFError: + break + pipe.generate(prompt, generation_config=config, streamer=streamer) + pipe.finish_chat() + + + For more information, refer to the + `Python sample `__. + + .. tab-item:: C++ + :sync: cpp + + .. 
code-block:: cpp + + #include "load_image.hpp" + #include + #include + + bool print_subword(std::string&& subword) { + return !(std::cout << subword << std::flush); + } + + int main(int argc, char* argv[]) try { + if (3 != argc) { + throw std::runtime_error(std::string{"Usage "} + argv[0] + " "); + } + + std::vector rgbs = utils::load_images(argv[2]); + + std::string device = "CPU"; // GPU can be used as well. + ov::AnyMap enable_compile_cache; + if ("GPU" == device) { + enable_compile_cache.insert({ov::cache_dir("vlm_cache")}); + } + ov::genai::VLMPipeline pipe(argv[1], device, enable_compile_cache); + + ov::genai::GenerationConfig generation_config; + generation_config.max_new_tokens = 100; + + std::string prompt; + + pipe.start_chat(); + std::cout << "question:\n"; + + std::getline(std::cin, prompt); + pipe.generate(prompt, + ov::genai::images(rgbs), + ov::genai::generation_config(generation_config), + ov::genai::streamer(print_subword)); + std::cout << "\n----------\n" + "question:\n"; + while (std::getline(std::cin, prompt)) { + pipe.generate(prompt, + ov::genai::generation_config(generation_config), + ov::genai::streamer(print_subword)); + std::cout << "\n----------\n" + "question:\n"; + } + pipe.finish_chat(); + } catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; + } catch (...) { + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; + } + + + For more information, refer to the + `C++ sample `__ + + +| + + +Chat-bot use case - step by step +############################################################################################### + +This example will show you how to create a chat-bot functionality, using the ``ov_genai.LLMPipeline`` +and a chat-tuned TinyLlama model. Apart from the basic implementation, it provides additional +optimization methods. 
+ +Although CPU is used as inference device in the samples below, you may choose GPU instead. +Note that tasks such as token selection, tokenization, and detokenization are always handled +by CPU only. Tokenizers, represented as a separate model, are also run on CPU. + +Running the model ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +You start with exporting an LLM model via Hugging Face Optimum-Intel. Note that the precision +of ``int4`` is used, instead of the original ``fp16``, for better performance. The weight +compression is done by NNCF at the model export stage. The exported model contains all the +information necessary for execution, including the tokenizer/detokenizer and the generation +config, ensuring that its results match those generated by Hugging Face. + +The `LLMPipeline` is the main object used for decoding and handles all the necessary steps. +You can construct it directly from the folder with the converted model. + + +.. tab-set:: + + .. tab-item:: Python + :sync: py + + .. code-block:: console + + optimum-cli export openvino --model "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --weight-format int4 --trust-remote-code "TinyLlama-1.1B-Chat-v1.0" + + .. code-block:: python + + import openvino_genai as ov_genai + pipe = ov_genai.LLMPipeline(model_path, "CPU") + print(pipe.generate("The Sun is yellow because", max_new_tokens=100)) + + .. tab-item:: C++ + :sync: cpp + + .. code-block:: console + + optimum-cli export openvino --model "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --weight-format int4 --trust-remote-code "TinyLlama-1.1B-Chat-v1.0" + + .. 
code-block:: cpp + + #include "openvino/genai/llm_pipeline.hpp" + #include + + int main(int argc, char* argv[]) { + std::string model_path = argv[1]; + ov::genai::LLMPipeline pipe(model_path, "CPU"); + std::cout << pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(100)); + } + + Streaming the Output -########################### ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -For more interactive UIs during generation, streaming of model output tokens is supported. See the example -below, where a lambda function outputs words to the console immediately upon generation: +For more interactive UIs during generation, you can stream output tokens. In this example, a +lambda function outputs words to the console immediately upon generation: .. tab-set:: @@ -177,12 +719,10 @@ You can also create your custom streamer for more sophisticated processing: Optimizing Generation with Grouped Beam Search -####################################################### ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -Leverage grouped beam search decoding and configure generation_config for better text generation -quality and efficient batch processing in GenAI applications. - -Specify generation_config to use grouped beam search: +For better text generation quality and more efficient batch processing, specify +``generation_config`` to leverage grouped beam search decoding. .. tab-set:: @@ -218,22 +758,19 @@ Specify generation_config to use grouped beam search: cout << pipe.generate("The Sun is yellow because", config); } + Efficient Text Generation via Speculative Decoding -################################################## +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Speculative decoding (or assisted-generation) enables faster token generation -when an additional smaller draft model is used alongside the main model. 
-The draft model predicts the next K tokens one by one in an autoregressive manner, -while the main model validates these predictions and corrects them if necessary. +when an additional smaller draft model is used alongside the main model. This reduces the +number of infer requests to the main model, increasing performance. -Each predicted token is compared, and when there is a difference between the draft and -main model, the last token predicted by the main model is kept. Then, the draft -model acquires this token and tries prediction of the next K tokens, -thus repeating the cycle. +The draft model predicts the next K tokens one by one in an autoregressive manner. The main +model validates these predictions and corrects them if necessary - in case of +a discrepancy, the main model prediction is used. Then, the draft model acquires this token and +runs prediction of the next K tokens, thus repeating the cycle. -This method eliminates the need for multiple infer requests to the main model, -which results in increased performance. Its implementation in the pipeline is -shown in the code samples below: .. tab-set:: @@ -265,7 +802,7 @@ shown in the code samples below: config.max_new_tokens = 100 config.num_assistant_tokens = 5 - pipe.generate(prompt, config, streamer) + pipe.generate("The Sun is yellow because", config, streamer) For more information, refer to the @@ -310,7 +847,7 @@ shown in the code samples below: return false; }; - pipe.generate(prompt, config, streamer); + pipe.generate("The Sun is yellow because", config, streamer); } catch (const std::exception& error) { try { std::cerr << error.what() << '\n'; @@ -327,10 +864,18 @@ shown in the code samples below: For more information, refer to the `C++ sample `__ + + + + + + + Comparing with Hugging Face Results ####################################### -Compare and analyze results with those generated by Hugging Face models. 
+You can compare the results of the above example with those generated by Hugging Face models by +running the following code: .. tab-set:: @@ -358,30 +903,35 @@ Compare and analyze results with those generated by Hugging Face models. assert hf_output == ov_output -GenAI API -####################################### -OpenVINO GenAI Flavor includes the following API: -* generation_config - defines a configuration class for text generation, enabling customization of the generation process such as the maximum length of the generated text, whether to ignore end-of-sentence tokens, and the specifics of the decoding strategy (greedy, beam search, or multinomial sampling). -* llm_pipeline - provides classes and utilities for text generation, including a pipeline for processing inputs, generating text, and managing outputs with configurable options. -* streamer_base - an abstract base class for creating streamers. -* tokenizer - the tokenizer class for text encoding and decoding. +GenAI API +####################################### +The use case described here uses the following OpenVINO GenAI API methods: + +* generation_config - defines a configuration class for text generation, + enabling customization of the generation process such as the maximum length of + the generated text, whether to ignore end-of-sentence tokens, and the specifics + of the decoding strategy (greedy, beam search, or multinomial sampling). +* llm_pipeline - provides classes and utilities for processing inputs, + text generation, and managing outputs with configurable options. +* streamer_base - an abstract base class for creating streamers. +* tokenizer - the tokenizer class for text encoding and decoding. * visibility - controls the visibility of the GenAI library. -Learn more in the `GenAI API reference `__. +Learn more from the `GenAI API reference `__. 
Additional Resources #################### * `OpenVINO GenAI Repo `__ * `OpenVINO GenAI Samples `__ +* A Jupyter notebook demonstrating + `Visual-language assistant with MiniCPM-V2 and OpenVINO `__ * `OpenVINO Tokenizers `__ * `Neural Network Compression Framework `__ - - diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide/genai-use-cases.rst b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide/genai-use-cases.rst deleted file mode 100644 index 245a2648aab491..00000000000000 --- a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide/genai-use-cases.rst +++ /dev/null @@ -1,563 +0,0 @@ -GenAI Use Cases -===================== - -This article provides several use case scenarios for Generative AI model -inference. The applications presented in the code samples below -only require minimal configuration, like setting an inference device. Feel free -to explore and modify the source code as you need. - - -Using GenAI for Text-to-Image Generation -######################################## - -Examples below demonstrate inference on text-to-image models, like Stable Diffusion -1.5, 2.1, and LCM, with a text prompt as input. The :ref:`main.cpp ` -sample shows basic usage of the ``Text2ImagePipeline`` pipeline. -:ref:`lora.cpp ` shows how to apply LoRA adapters to the pipeline. - - -.. tab-set:: - - .. tab-item:: Python - :sync: python - - .. tab-set:: - - .. tab-item:: main.py - :name: mainpy - - .. 
code-block:: python - - import openvino_genai - from PIL import Image - import numpy as np - - class Generator(openvino_genai.Generator): - def __init__(self, seed, mu=0.0, sigma=1.0): - openvino_genai.Generator.__init__(self) - np.random.seed(seed) - self.mu = mu - self.sigma = sigma - - def next(self): - return np.random.normal(self.mu, self.sigma) - - - def infer(model_dir: str, prompt: str): - device = 'CPU' # GPU can be used as well - random_generator = Generator(42) - pipe = openvino_genai.Text2ImagePipeline(model_dir, device) - image_tensor = pipe.generate( - prompt, - width=512, - height=512, - num_inference_steps=20, - num_images_per_prompt=1, - random_generator=random_generator - ) - - image = Image.fromarray(image_tensor.data[0]) - image.save("image.bmp") - - .. tab-item:: LoRA.py - :name: lorapy - - .. code-block:: python - - import openvino as ov - import openvino_genai - import numpy as np - import sys - - - class Generator(openvino_genai.Generator): - def __init__(self, seed, mu=0.0, sigma=1.0): - openvino_genai.Generator.__init__(self) - np.random.seed(seed) - self.mu = mu - self.sigma = sigma - - def next(self): - return np.random.normal(self.mu, self.sigma) - - - def image_write(path: str, image_tensor: ov.Tensor): - from PIL import Image - image = Image.fromarray(image_tensor.data[0]) - image.save(path) - - - def infer(models_path: str, prompt: str): - prompt = "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" - - device = "CPU" # GPU, NPU can be used as well - adapter_config = openvino_genai.AdapterConfig() - - for i in range(int(len(adapters) / 2)): - adapter = openvino_genai.Adapter(adapters[2 * i]) - alpha = float(adapters[2 * i + 1]) - adapter_config.add(adapter, alpha) - - pipe = openvino_genai.Text2ImagePipeline(models_path, device, adapters=adapter_config) - print("Generating image with LoRA adapters applied, resulting image will be in lora.bmp") - image = pipe.generate(prompt, - 
random_generator=Generator(42), - width=512, - height=896, - num_inference_steps=20) - - image_write("lora.bmp", image) - print("Generating image without LoRA adapters applied, resulting image will be in baseline.bmp") - image = pipe.generate(prompt, - adapters=openvino_genai.AdapterConfig(), - random_generator=Generator(42), - width=512, - height=896, - num_inference_steps=20 - ) - image_write("baseline.bmp", image) - - For more information, refer to the - `Python sample `__ - - .. tab-item:: C++ - :sync: cpp - - .. tab-set:: - - .. tab-item:: main.cpp - :name: maincpp - - .. code-block:: cpp - - #include "openvino/genai/text2image/pipeline.hpp" - - #include "imwrite.hpp" - - int32_t main(int32_t argc, char* argv[]) try { - OPENVINO_ASSERT(argc == 3, "Usage: ", argv[0], " ''"); - - const std::string models_path = argv[1], prompt = argv[2]; - const std::string device = "CPU"; // GPU, NPU can be used as well - - ov::genai::Text2ImagePipeline pipe(models_path, device); - ov::Tensor image = pipe.generate(prompt, - ov::genai::width(512), - ov::genai::height(512), - ov::genai::num_inference_steps(20), - ov::genai::num_images_per_prompt(1)); - - imwrite("image_%d.bmp", image, true); - - return EXIT_SUCCESS; - } catch (const std::exception& error) { - try { - std::cerr << error.what() << '\n'; - } catch (const std::ios_base::failure&) {} - return EXIT_FAILURE; - } catch (...) { - try { - std::cerr << "Non-exception object thrown\n"; - } catch (const std::ios_base::failure&) {} - return EXIT_FAILURE; - } - - .. tab-item:: LoRA.cpp - :name: loracpp - - .. 
code-block:: cpp - - #include "openvino/genai/text2image/pipeline.hpp" - - #include "imwrite.hpp" - - int32_t main(int32_t argc, char* argv[]) try { - OPENVINO_ASSERT(argc >= 3 && (argc - 3) % 2 == 0, "Usage: ", argv[0], " '' [ ...]]"); - - const std::string models_path = argv[1], prompt = argv[2]; - const std::string device = "CPU"; // GPU, NPU can be used as well - - ov::genai::AdapterConfig adapter_config; - for(size_t i = 0; i < (argc - 3)/2; ++i) { - ov::genai::Adapter adapter(argv[3 + 2*i]); - float alpha = std::atof(argv[3 + 2*i + 1]); - adapter_config.add(adapter, alpha); - } - - ov::genai::Text2ImagePipeline pipe(models_path, device, ov::genai::adapters(adapter_config)); - - std::cout << "Generating image with LoRA adapters applied, resulting image will be in lora.bmp\n"; - ov::Tensor image = pipe.generate(prompt, - ov::genai::random_generator(std::make_shared(42)), - ov::genai::width(512), - ov::genai::height(896), - ov::genai::num_inference_steps(20)); - imwrite("lora.bmp", image, true); - - std::cout << "Generating image without LoRA adapters applied, resulting image will be in baseline.bmp\n"; - image = pipe.generate(prompt, - ov::genai::adapters(), - ov::genai::random_generator(std::make_shared(42)), - ov::genai::width(512), - ov::genai::height(896), - ov::genai::num_inference_steps(20)); - imwrite("baseline.bmp", image, true); - - return EXIT_SUCCESS; - } catch (const std::exception& error) { - try { - std::cerr << error.what() << '\n'; - } catch (const std::ios_base::failure&) {} - return EXIT_FAILURE; - } catch (...) { - try { - std::cerr << "Non-exception object thrown\n"; - } catch (const std::ios_base::failure&) {} - return EXIT_FAILURE; - } - - - For more information, refer to the - `C++ sample `__ - - - - - -Using GenAI in Speech Recognition -################################# - - -The application, shown in code samples below, performs inference on speech -recognition Whisper Models. 
The samples include the ``WhisperPipeline`` class -and use audio files in WAV format at a sampling rate of 16 kHz as input. - -.. tab-set:: - - .. tab-item:: Python - :sync: cpp - - .. code-block:: python - - import openvino_genai - import librosa - - - def read_wav(filepath): - raw_speech, samplerate = librosa.load(filepath, sr=16000) - return raw_speech.tolist() - - - def infer(model_dir: str, wav_file_path: str): - device = "CPU" # GPU or NPU can be used as well. - pipe = openvino_genai.WhisperPipeline(model_dir, device) - - # The pipeline expects normalized audio with a sampling rate of 16kHz. - raw_speech = read_wav(wav_file_path) - result = pipe.generate( - raw_speech, - max_new_tokens=100, - language="<|en|>", - task="transcribe", - return_timestamps=True, - ) - - print(result) - - for chunk in result.chunks: - print(f"timestamps: [{chunk.start_ts}, {chunk.end_ts}] text: {chunk.text}") - - - For more information, refer to the - `Python sample `__. - - .. tab-item:: C++ - :sync: cpp - - .. code-block:: cpp - - #include "audio_utils.hpp" - #include "openvino/genai/whisper_pipeline.hpp" - - int main(int argc, char* argv[]) try { - if (3 > argc) { - throw std::runtime_error(std::string{"Usage: "} + argv[0] + " \"\""); - } - - std::filesystem::path models_path = argv[1]; - std::string wav_file_path = argv[2]; - std::string device = "CPU"; // GPU or NPU can be used as well. - - ov::genai::WhisperPipeline pipeline(models_path, device); - - ov::genai::WhisperGenerationConfig config(models_path / "generation_config.json"); - config.max_new_tokens = 100; - config.language = "<|en|>"; - config.task = "transcribe"; - config.return_timestamps = true; - - // The pipeline expects normalized audio with a sampling rate of 16kHz. 
- ov::genai::RawSpeechInput raw_speech = utils::audio::read_wav(wav_file_path); - auto result = pipeline.generate(raw_speech, config); - - std::cout << result << "\n"; - - for (auto& chunk : *result.chunks) { - std::cout << "timestamps: [" << chunk.start_ts << ", " << chunk.end_ts << "] text: " << chunk.text << "\n"; - } - - } catch (const std::exception& error) { - try { - std::cerr << error.what() << '\n'; - } catch (const std::ios_base::failure&) { - } - return EXIT_FAILURE; - } catch (...) { - try { - std::cerr << "Non-exception object thrown\n"; - } catch (const std::ios_base::failure&) { - } - return EXIT_FAILURE; - } - - - For more information, refer to the - `C++ sample `__. - - -Using GenAI in Chat Scenario -############################ - -For chat scenarios where inputs and outputs represent a conversation, maintaining KVCache across inputs -may prove beneficial. The ``start_chat`` and ``finish_chat`` chat-specific methods are used to -mark a conversation session, as shown in the samples below: - -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. code-block:: python - - import openvino_genai - - - def streamer(subword): - print(subword, end='', flush=True) - return False - - - def infer(model_dir: str): - device = 'CPU' # GPU can be used as well. - pipe = openvino_genai.LLMPipeline(model_dir, device) - - config = openvino_genai.GenerationConfig() - config.max_new_tokens = 100 - - pipe.start_chat() - while True: - try: - prompt = input('question:\n') - except EOFError: - break - pipe.generate(prompt, config, streamer) - print('\n----------') - pipe.finish_chat() - - - - For more information, refer to the - `Python sample `__. - - .. tab-item:: C++ - :sync: cpp - - .. 
code-block:: cpp - - #include "openvino/genai/llm_pipeline.hpp" - - int main(int argc, char* argv[]) try { - if (2 != argc) { - throw std::runtime_error(std::string{"Usage: "} + argv[0] + " "); - } - std::string prompt; - std::string models_path = argv[1]; - - std::string device = "CPU"; // GPU, NPU can be used as well - ov::genai::LLMPipeline pipe(models_path, device); - - ov::genai::GenerationConfig config; - config.max_new_tokens = 100; - std::function streamer = [](std::string word) { - std::cout << word << std::flush; - return false; - }; - - pipe.start_chat(); - std::cout << "question:\n"; - while (std::getline(std::cin, prompt)) { - pipe.generate(prompt, config, streamer); - std::cout << "\n----------\n" - "question:\n"; - } - pipe.finish_chat(); - } catch (const std::exception& error) { - try { - std::cerr << error.what() << '\n'; - } catch (const std::ios_base::failure&) {} - return EXIT_FAILURE; - } catch (...) { - try { - std::cerr << "Non-exception object thrown\n"; - } catch (const std::ios_base::failure&) {} - return EXIT_FAILURE; - } - - - For more information, refer to the - `C++ sample `__ - - -Using GenAI with Vision Language Models -####################################### - -OpenVINO GenAI introduces the ``openvino_genai.VLMPipeline`` pipeline for -inference of multimodal text-generation Vision Language Models (VLMs). -With a text prompt and an image as input, VLMPipeline can generate text using -models such as LLava or MiniCPM-V. See the chat scenario presented -in the samples below: - -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. 
code-block:: python - - import numpy as np - import openvino_genai - from PIL import Image - from openvino import Tensor - from pathlib import Path - - - def streamer(subword: str) -> bool: - print(subword, end='', flush=True) - - - def read_image(path: str) -> Tensor: - pic = Image.open(path).convert("RGB") - image_data = np.array(pic.getdata()).reshape(1, pic.size[1], pic.size[0], 3).astype(np.uint8) - return Tensor(image_data) - - - def read_images(path: str) -> list[Tensor]: - entry = Path(path) - if entry.is_dir(): - return [read_image(str(file)) for file in sorted(entry.iterdir())] - return [read_image(path)] - - - def infer(model_dir: str, image_dir: str): - rgbs = read_images(image_dir) - device = 'CPU' # GPU can be used as well. - enable_compile_cache = dict() - if "GPU" == device: - enable_compile_cache["CACHE_DIR"] = "vlm_cache" - pipe = openvino_genai.VLMPipeline(model_dir, device, **enable_compile_cache) - - config = openvino_genai.GenerationConfig() - config.max_new_tokens = 100 - - pipe.start_chat() - prompt = input('question:\n') - pipe.generate(prompt, images=rgbs, generation_config=config, streamer=streamer) - - while True: - try: - prompt = input("\n----------\n" - "question:\n") - except EOFError: - break - pipe.generate(prompt, generation_config=config, streamer=streamer) - pipe.finish_chat() - - - For more information, refer to the - `Python sample `__. - - .. tab-item:: C++ - :sync: cpp - - .. code-block:: cpp - - #include "load_image.hpp" - #include - #include - - bool print_subword(std::string&& subword) { - return !(std::cout << subword << std::flush); - } - - int main(int argc, char* argv[]) try { - if (3 != argc) { - throw std::runtime_error(std::string{"Usage "} + argv[0] + " "); - } - - std::vector rgbs = utils::load_images(argv[2]); - - std::string device = "CPU"; // GPU can be used as well. 
- ov::AnyMap enable_compile_cache; - if ("GPU" == device) { - enable_compile_cache.insert({ov::cache_dir("vlm_cache")}); - } - ov::genai::VLMPipeline pipe(argv[1], device, enable_compile_cache); - - ov::genai::GenerationConfig generation_config; - generation_config.max_new_tokens = 100; - - std::string prompt; - - pipe.start_chat(); - std::cout << "question:\n"; - - std::getline(std::cin, prompt); - pipe.generate(prompt, - ov::genai::images(rgbs), - ov::genai::generation_config(generation_config), - ov::genai::streamer(print_subword)); - std::cout << "\n----------\n" - "question:\n"; - while (std::getline(std::cin, prompt)) { - pipe.generate(prompt, - ov::genai::generation_config(generation_config), - ov::genai::streamer(print_subword)); - std::cout << "\n----------\n" - "question:\n"; - } - pipe.finish_chat(); - } catch (const std::exception& error) { - try { - std::cerr << error.what() << '\n'; - } catch (const std::ios_base::failure&) {} - return EXIT_FAILURE; - } catch (...) { - try { - std::cerr << "Non-exception object thrown\n"; - } catch (const std::ios_base::failure&) {} - return EXIT_FAILURE; - } - - - For more information, refer to the - `C++ sample `__ - -Additional Resources -##################### - -* :doc:`Install OpenVINO GenAI <../../../get-started/install-openvino/install-openvino-genai>` -* `OpenVINO GenAI Repo `__ -* `OpenVINO GenAI Samples `__ -* A Jupyter notebook demonstrating - `Visual-language assistant with MiniCPM-V2 and OpenVINO `__ -* `OpenVINO Tokenizers `__ diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/genai-model-preparation.rst b/docs/articles_en/learn-openvino/llm_inference_guide/genai-model-preparation.rst new file mode 100644 index 00000000000000..53b8d5440ca855 --- /dev/null +++ b/docs/articles_en/learn-openvino/llm_inference_guide/genai-model-preparation.rst @@ -0,0 +1,159 @@ +Generative Model Preparation +=============================================================================== + +.. 
meta:: + :description: Learn how to use Hugging Face Hub and Optimum Intel APIs to + prepare generative models for inference. + + + +Since generative AI models tend to be big and resource-heavy, it is advisable to store them +locally and optimize for efficient inference. This article will show how to prepare +LLM models for inference with OpenVINO by: + +* `Downloading Models from Hugging Face <#download-generative-models-from-hugging-face-hub>`__ +* `Downloading Models from Model Scope <#download-generative-models-from-model-scope>`__ +* `Converting and Optimizing Generative Models <#convert-and-optimize-generative-models>`__ + + + +Download Generative Models From Hugging Face Hub +############################################################################### + +Pre-converted and pre-optimized models are available in the `OpenVINO Toolkit `__ +organization, under the `model section `__, or under +different model collections: + +* `LLM `__ +* `Speech-to-Text `__ +* `Speculative Decoding Draft Models `__ + +You can also use the **huggingface_hub** package to download models: + +.. code-block:: console + + pip install huggingface_hub + huggingface-cli download "OpenVINO/phi-2-fp16-ov" --local-dir model_path + + +The models can be used in OpenVINO immediately after download. No dependencies +are required except **huggingface_hub**. + + +Download Generative Models From Model Scope +############################################################################### + +To download models from `Model Scope `__, +use the **modelscope** package: + +.. code-block:: console + + pip install modelscope + modelscope download --model "Qwen/Qwen2-7b" --local_dir model_path + +Models downloaded via Model Scope are available in PyTorch format only and they must +be :doc:`converted to OpenVINO IR <../../openvino-workflow/model-preparation/convert-model-to-ir>` +before inference.
 + +Convert and Optimize Generative Models +############################################################################### + +OpenVINO works best with models in the OpenVINO IR format, both in full precision and quantized. +If your selected model has not been pre-optimized, you can easily do it yourself, using a single +**optimum-cli** command. For that, make sure optimum-intel is installed on your system: + +.. code-block:: console + + pip install optimum-intel[openvino] + + +While optimizing models, you can decide to keep the original precision or select one that is lower. + +.. tab-set:: + + .. tab-item:: Keeping full model precision + :sync: full-precision + + .. code-block:: console + + optimum-cli export openvino --model --weight-format fp16 + + Examples: + + .. tab-set:: + + .. tab-item:: LLM (text generation) + :sync: llm-text-gen + + .. code-block:: console + + optimum-cli export openvino --model meta-llama/Llama-2-7b-chat-hf --weight-format fp16 ov_llama_2 + + .. tab-item:: Diffusion models (text2image) + :sync: diff-text-img + + .. code-block:: console + + optimum-cli export openvino --model stabilityai/stable-diffusion-xl-base-1.0 --weight-format fp16 ov_SDXL + + .. tab-item:: VLM (Image processing) + :sync: vlm-img-proc + + .. code-block:: console + + optimum-cli export openvino --model openbmb/MiniCPM-V-2_6 --trust-remote-code --weight-format fp16 ov_MiniCPM-V-2_6 + + .. tab-item:: Whisper models (speech2text) + :sync: whisp-speech-txt + + .. code-block:: console + + optimum-cli export openvino --trust-remote-code --model openai/whisper-base ov_whisper + + .. tab-item:: Exporting to selected precision + :sync: low-precision + + .. code-block:: console + + optimum-cli export openvino --model --weight-format int4 + + Examples: + + .. tab-set:: + + .. tab-item:: LLM (text generation) + :sync: llm-text-gen + + .. code-block:: console + + optimum-cli export openvino --model meta-llama/Llama-2-7b-chat-hf --weight-format int4 ov_llama_2 + + .. 
tab-item:: Diffusion models (text2image) + :sync: diff-text-img + + .. code-block:: console + + optimum-cli export openvino --model stabilityai/stable-diffusion-xl-base-1.0 --weight-format int4 ov_SDXL + + .. tab-item:: VLM (Image processing) + :sync: vlm-img-proc + + .. code-block:: console + + optimum-cli export openvino -m model_path --task text-generation-with-past --weight-format int4 ov_MiniCPM-V-2_6 + + +.. note:: + + Any other ``model_id``, for example ``openbmb/MiniCPM-V-2_6``, or the path + to a local model file can be used. + + Also, you can specify different data type like ``int8``. + + +Additional Resources +############################################################################### + +* `Full set of optimum-cli parameters `__ +* :doc:`Model conversion in OpenVINO <../../openvino-workflow/model-preparation/convert-model-to-ir>` +* :doc:`Model optimization in OpenVINO <../../openvino-workflow/model-optimization>` diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/llm-inference-hf.rst b/docs/articles_en/learn-openvino/llm_inference_guide/llm-inference-hf.rst index a26b670b5314d0..4fec1acd23e6a7 100644 --- a/docs/articles_en/learn-openvino/llm_inference_guide/llm-inference-hf.rst +++ b/docs/articles_en/learn-openvino/llm_inference_guide/llm-inference-hf.rst @@ -1,4 +1,4 @@ -Run LLMs with Hugging Face and Optimum Intel +Inference with Optimum Intel =============================================================================================== .. meta:: @@ -276,9 +276,10 @@ includes **Dynamic quantization** of activations of 4/8-bit quantized MatMuls an ov_config={"KV_CACHE_PRECISION": "u8", "DYNAMIC_QUANTIZATION_GROUP_SIZE": "32", "PERFORMANCE_HINT": "LATENCY"} ) -.. note:: + .. note:: + Currently, for KV-cache quantization, GPU ignores the DYNAMIC_QUANTIZATION_GROUP_SIZE property, using ``group_size = head_size``. Additionally, it does not support the ``get_state()`` and ``set_state()`` APIs when KV-cache quantization is enabled. 
- Currently, both Dynamic quantization and KV-cache quantization are available for CPU device. + For GPU, KV-cache quantization is enabled by default on platforms without XMX support, and can be disabled by setting KV_CACHE_PRECISION to ``undefined``. Working with Models Tuned with LoRA diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/llm-inference-native-ov.rst b/docs/articles_en/learn-openvino/llm_inference_guide/llm-inference-native-ov.rst index 2476a0423e30e1..d33ae05f68f462 100644 --- a/docs/articles_en/learn-openvino/llm_inference_guide/llm-inference-native-ov.rst +++ b/docs/articles_en/learn-openvino/llm_inference_guide/llm-inference-native-ov.rst @@ -1,4 +1,4 @@ -Run LLM Inference on Native OpenVINO (not recommended) +Generative AI with Base OpenVINO (not recommended) =============================================================================================== To run Generative AI models using native OpenVINO APIs you need to follow regular diff --git a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst index 7b135fa7ff0b14..436d383ebf787e 100644 --- a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst +++ b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst @@ -146,6 +146,8 @@ offer a limited set of supported OpenVINO features. ov::intel_npu::turbo ov::intel_npu::tiles ov::intel_npu::max_tiles + ov::intel_npu::bypass_umd_caching + ov::intel_npu::defer_weights_load .. tab-item:: Read-only properties @@ -168,7 +170,6 @@ offer a limited set of supported OpenVINO features. ov::intel_npu::device_alloc_mem_size ov::intel_npu::device_total_mem_size ov::intel_npu::driver_version - ov::intel_npu::bypass_umd_caching .. 
note:: diff --git a/docs/articles_en/openvino-workflow/running-inference/integrate-openvino-with-your-application/model-representation.rst b/docs/articles_en/openvino-workflow/running-inference/integrate-openvino-with-your-application/model-representation.rst index 6ab924a61ef150..259f605d46c2f7 100644 --- a/docs/articles_en/openvino-workflow/running-inference/integrate-openvino-with-your-application/model-representation.rst +++ b/docs/articles_en/openvino-workflow/running-inference/integrate-openvino-with-your-application/model-representation.rst @@ -247,57 +247,50 @@ OpenVINO™ provides several debug capabilities: * Model can be visualized to image from the xDot format: -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. doxygensnippet:: docs/articles_en/assets/snippets/ov_model_snippets.py - :language: python - :fragment: [ov:visualize] - - .. tab-item:: C++ - :sync: cpp - - .. doxygensnippet:: docs/articles_en/assets/snippets/ov_model_snippets.cpp - :language: cpp - :fragment: [ov:visualize] - - -.. code-block:: sh + .. tab-set:: - `ov::pass::VisualizeTree` can be parametrized via environment variables: + .. tab-item:: Python + :sync: py - OV_VISUALIZE_TREE_OUTPUT_SHAPES=1 - visualize shapes + .. doxygensnippet:: docs/articles_en/assets/snippets/ov_model_snippets.py + :language: python + :fragment: [ov:visualize] - OV_VISUALIZE_TREE_OUTPUT_TYPES=1 - visualize types + .. tab-item:: C++ + :sync: cpp - OV_VISUALIZE_TREE_MIN_MAX_DENORMAL=1 - pretty denormal values + .. 
doxygensnippet:: docs/articles_en/assets/snippets/ov_model_snippets.cpp + :language: cpp + :fragment: [ov:visualize] - OV_VISUALIZE_TREE_RUNTIME_INFO=1 - print runtime information - OV_VISUALIZE_TREE_IO=1 - print I/O ports + ``ov::pass::VisualizeTree`` can be parametrized via environment variables: - OV_VISUALIZE_TREE_MEMBERS_NAME=1 - print member names + * ``OV_VISUALIZE_TREE_OUTPUT_SHAPES=1`` - visualize shapes + * ``OV_VISUALIZE_TREE_OUTPUT_TYPES=1`` - visualize types + * ``OV_VISUALIZE_TREE_MIN_MAX_DENORMAL=1`` - pretty denormal values + * ``OV_VISUALIZE_TREE_RUNTIME_INFO=1`` - print runtime information + * ``OV_VISUALIZE_TREE_IO=1`` - print I/O ports + * ``OV_VISUALIZE_TREE_MEMBERS_NAME=1`` - print member names * Also model can be serialized to IR: -.. tab-set:: + .. tab-set:: - .. tab-item:: Python - :sync: py + .. tab-item:: Python + :sync: py - .. doxygensnippet:: docs/articles_en/assets/snippets/ov_model_snippets.py - :language: python - :fragment: [ov:serialize] + .. doxygensnippet:: docs/articles_en/assets/snippets/ov_model_snippets.py + :language: python + :fragment: [ov:serialize] - .. tab-item:: C++ - :sync: cpp + .. tab-item:: C++ + :sync: cpp - .. doxygensnippet:: docs/articles_en/assets/snippets/ov_model_snippets.cpp - :language: cpp - :fragment: [ov:serialize] + .. doxygensnippet:: docs/articles_en/assets/snippets/ov_model_snippets.cpp + :language: cpp + :fragment: [ov:serialize] Additional Resources @@ -306,5 +299,3 @@ Additional Resources * :doc:`Available Operation Sets <../../../documentation/openvino-ir-format/operation-sets/available-opsets>`. * :doc:`OpenVINO™ Runtime Extensibility Developer Guide <../../../documentation/openvino-extensibility>`. * :doc:`Transformations Developer Guide <../../../documentation/openvino-extensibility/transformation-api>`. 
- - diff --git a/docs/nbdoc/consts.py b/docs/nbdoc/consts.py index 1a4d3a13049041..e5c5d4773dce4c 100644 --- a/docs/nbdoc/consts.py +++ b/docs/nbdoc/consts.py @@ -6,7 +6,7 @@ repo_owner = "openvinotoolkit" repo_name = "openvino_notebooks" repo_branch = "tree/main" -artifacts_link = "http://repository.toolbox.iotg.sclab.intel.com/projects/ov-notebook/0.1.0-latest/20241120220806/dist/rst_files/" +artifacts_link = "http://repository.toolbox.iotg.sclab.intel.com/projects/ov-notebook/0.1.0-latest/20241209220902/dist/rst_files/" blacklisted_extensions = ['.xml', '.bin'] notebooks_repo = "https://github.com/openvinotoolkit/openvino_notebooks/blob/latest/" notebooks_binder = "https://mybinder.org/v2/gh/openvinotoolkit/openvino_notebooks/HEAD?filepath=" diff --git a/docs/notebooks/3D-pose-estimation-with-output.rst b/docs/notebooks/3D-pose-estimation-with-output.rst index 9e09d96094fc78..7959bf48a75e45 100644 --- a/docs/notebooks/3D-pose-estimation-with-output.rst +++ b/docs/notebooks/3D-pose-estimation-with-output.rst @@ -113,58 +113,57 @@ Lab instead.** Collecting torch Using cached https://download.pytorch.org/whl/cpu/torch-2.4.1%2Bcpu-cp38-cp38-linux_x86_64.whl (194.9 MB) Collecting tqdm - Using cached tqdm-4.67.0-py3-none-any.whl.metadata (57 kB) - Requirement already satisfied: ipywidgets>=7.2.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from pythreejs) (8.1.5) + Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB) + Requirement already satisfied: ipywidgets>=7.2.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from pythreejs) (8.1.5) Collecting ipydatawidgets>=1.1.1 (from pythreejs) Using cached ipydatawidgets-4.3.5-py2.py3-none-any.whl.metadata (1.4 kB) Collecting numpy (from pythreejs) Using cached 
numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB) - Requirement already satisfied: traitlets in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from pythreejs) (5.14.3) + Requirement already satisfied: traitlets in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from pythreejs) (5.14.3) Collecting openvino-telemetry>=2023.2.1 (from openvino>=2024.4.0) Using cached openvino_telemetry-2024.5.0-py3-none-any.whl.metadata (2.3 kB) - Requirement already satisfied: packaging in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from openvino>=2024.4.0) (24.2) + Requirement already satisfied: packaging in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from openvino>=2024.4.0) (24.2) Collecting filelock (from torch) Using cached filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB) - Requirement already satisfied: typing-extensions>=4.8.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (4.12.2) + Requirement already satisfied: typing-extensions>=4.8.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (4.12.2) Collecting sympy (from torch) Using cached sympy-1.13.3-py3-none-any.whl.metadata (12 kB) Collecting networkx (from torch) Using cached https://download.pytorch.org/whl/networkx-3.2.1-py3-none-any.whl (1.6 MB) - Requirement already satisfied: jinja2 in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (3.1.4) + Requirement already satisfied: jinja2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (3.1.4) Collecting fsspec (from torch) Using cached fsspec-2024.10.0-py3-none-any.whl.metadata (11 kB) Collecting traittypes>=0.2.0 (from ipydatawidgets>=1.1.1->pythreejs) Using cached traittypes-0.2.1-py2.py3-none-any.whl.metadata (1.0 kB) - Requirement already satisfied: comm>=0.1.3 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipywidgets>=7.2.1->pythreejs) (0.2.2) - Requirement already satisfied: ipython>=6.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipywidgets>=7.2.1->pythreejs) (8.12.3) - Requirement already satisfied: widgetsnbextension~=4.0.12 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipywidgets>=7.2.1->pythreejs) (4.0.13) - Requirement already satisfied: jupyterlab-widgets~=3.0.12 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipywidgets>=7.2.1->pythreejs) (3.0.13) - Requirement already satisfied: MarkupSafe>=2.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from jinja2->torch) (2.1.5) + Requirement already satisfied: comm>=0.1.3 in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipywidgets>=7.2.1->pythreejs) (0.2.2) + Requirement already satisfied: ipython>=6.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipywidgets>=7.2.1->pythreejs) (8.12.3) + Requirement already satisfied: widgetsnbextension~=4.0.12 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipywidgets>=7.2.1->pythreejs) (4.0.13) + Requirement already satisfied: jupyterlab-widgets~=3.0.12 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipywidgets>=7.2.1->pythreejs) (3.0.13) + Requirement already satisfied: MarkupSafe>=2.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from jinja2->torch) (2.1.5) INFO: pip is looking at multiple versions of networkx to determine which version is compatible with other requirements. This could take a while. 
Collecting networkx (from torch) Using cached networkx-3.1-py3-none-any.whl.metadata (5.3 kB) Collecting mpmath<1.4,>=1.1.0 (from sympy->torch) Using cached https://download.pytorch.org/whl/mpmath-1.3.0-py3-none-any.whl (536 kB) - Requirement already satisfied: backcall in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.2.0) - Requirement already satisfied: decorator in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (5.1.1) - Requirement already satisfied: jedi>=0.16 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.19.2) - Requirement already satisfied: matplotlib-inline in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.1.7) - Requirement already satisfied: pickleshare in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.7.5) - Requirement already satisfied: prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (3.0.48) - Requirement already satisfied: pygments>=2.4.0 in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (2.18.0) - Requirement already satisfied: stack-data in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.6.3) - Requirement already satisfied: pexpect>4.3 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (4.9.0) - Requirement already satisfied: parso<0.9.0,>=0.8.4 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from jedi>=0.16->ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.8.4) - Requirement already satisfied: ptyprocess>=0.5 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from pexpect>4.3->ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.7.0) - Requirement already satisfied: wcwidth in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30->ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.2.13) - Requirement already satisfied: executing>=1.2.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from stack-data->ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (2.1.0) - Requirement already satisfied: asttokens>=2.1.0 in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from stack-data->ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (2.4.1) - Requirement already satisfied: pure-eval in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from stack-data->ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.2.3) - Requirement already satisfied: six>=1.12.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from asttokens>=2.1.0->stack-data->ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (1.16.0) + Requirement already satisfied: backcall in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.2.0) + Requirement already satisfied: decorator in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (5.1.1) + Requirement already satisfied: jedi>=0.16 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.19.2) + Requirement already satisfied: matplotlib-inline in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.1.7) + Requirement already satisfied: pickleshare in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.7.5) + Requirement already satisfied: prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (3.0.48) + Requirement already satisfied: pygments>=2.4.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (2.18.0) + Requirement already satisfied: stack-data in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.6.3) + Requirement already satisfied: pexpect>4.3 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (4.9.0) + Requirement already satisfied: parso<0.9.0,>=0.8.4 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from jedi>=0.16->ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.8.4) + Requirement already satisfied: ptyprocess>=0.5 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from pexpect>4.3->ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.7.0) + Requirement already satisfied: wcwidth in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30->ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.2.13) + Requirement already satisfied: executing>=1.2.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from stack-data->ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (2.1.0) + Requirement already satisfied: asttokens>=2.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from stack-data->ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (3.0.0) + Requirement already satisfied: pure-eval in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from stack-data->ipython>=6.1.0->ipywidgets>=7.2.1->pythreejs) (0.2.3) Using cached pythreejs-2.4.2-py3-none-any.whl (3.4 MB) Using cached openvino-2024.4.0-16579-cp38-cp38-manylinux2014_x86_64.whl (42.6 MB) Using cached opencv_python-4.10.0.84-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (62.5 MB) - Using cached tqdm-4.67.0-py3-none-any.whl (78 kB) + Using cached tqdm-4.67.1-py3-none-any.whl (78 kB) Using cached ipydatawidgets-4.3.5-py2.py3-none-any.whl (271 kB) Using cached numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB) Using cached openvino_telemetry-2024.5.0-py3-none-any.whl (23 kB) @@ -174,7 +173,7 @@ Lab instead.** Using cached sympy-1.13.3-py3-none-any.whl (6.2 MB) Using cached traittypes-0.2.1-py2.py3-none-any.whl (8.6 kB) Installing collected packages: openvino-telemetry, mpmath, traittypes, tqdm, sympy, numpy, networkx, fsspec, filelock, torch, openvino, opencv-python, ipydatawidgets, pythreejs - Successfully installed 
filelock-3.16.1 fsspec-2024.10.0 ipydatawidgets-4.3.5 mpmath-1.3.0 networkx-3.1 numpy-1.24.4 opencv-python-4.10.0.84 openvino-2024.4.0 openvino-telemetry-2024.5.0 pythreejs-2.4.2 sympy-1.13.3 torch-2.4.1+cpu tqdm-4.67.0 traittypes-0.2.1 + Successfully installed filelock-3.16.1 fsspec-2024.10.0 ipydatawidgets-4.3.5 mpmath-1.3.0 networkx-3.1 numpy-1.24.4 opencv-python-4.10.0.84 openvino-2024.4.0 openvino-telemetry-2024.5.0 pythreejs-2.4.2 sympy-1.13.3 torch-2.4.1+cpu tqdm-4.67.1 traittypes-0.2.1 Note: you may need to restart the kernel to use updated packages. @@ -250,7 +249,7 @@ Download the model .. parsed-literal:: - model/human-pose-estimation-3d.tar.gz: 0%| | 0.00/17.6M [00:00`__ tackles the task of generating animation sequences from a single character image. It @@ -34,10 +36,14 @@ Learn more in `GitHub repo `__ and `paper `__. -.. warning:: +.. container:: alert alert-warning - This tutorial requires at least **96 GB** of RAM for model conversion and **40 GB** for inference. Changing the values of ``HEIGHT``, ``WIDTH`` and ``VIDEO_LENGTH`` variables will change the memory consumption but will also affect accuracy. + :: +

! WARNING !

+

+ This tutorial requires at least 96 GB of RAM for model conversion and 40 GB for inference. Changing the values of HEIGHT, WIDTH and VIDEO_LENGTH variables will change the memory consumption but will also affect accuracy. +

**Table of contents:** @@ -70,9 +76,6 @@ need a Jupyter server to start. For details, please refer to `Installation Guide `__. -.. |image0| image:: https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/animate-anyone/animate-anyone.gif - - Prerequisites ------------- @@ -110,13 +113,6 @@ Prerequisites %load_ext skip_kernel_extension - -.. parsed-literal:: - - Note: you may need to restart the kernel to use updated packages. - Note: you may need to restart the kernel to use updated packages. - - Note that we clone a fork of original repo with tweaked forward methods. .. code:: ipython3 @@ -171,9 +167,11 @@ Note that we clone a fork of original repo with tweaked forward methods. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/utils/outputs.py:63: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + /home/itrushkin/.virtualenvs/test/lib/python3.10/site-packages/diffusers/utils/outputs.py:63: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead. + torch.utils._pytree._register_pytree_node( + /home/itrushkin/.virtualenvs/test/lib/python3.10/site-packages/diffusers/utils/outputs.py:63: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead. torch.utils._pytree._register_pytree_node( - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/utils/outputs.py:63: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. 
+ /home/itrushkin/.virtualenvs/test/lib/python3.10/site-packages/diffusers/utils/outputs.py:63: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead. torch.utils._pytree._register_pytree_node( @@ -221,13 +219,6 @@ Prepare base model local_dir=local_dir, ) - - -.. parsed-literal:: - - diffusion_pytorch_model.bin: 0%| | 0.00/3.44G [00:00:2: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. - :6: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. 
We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. - :9: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. - - Convert model to OpenVINO IR ---------------------------- @@ -423,7 +337,7 @@ semantic features are extracted through the CLIP image encoder for Cross-Attention. Temporal-Attention operates in the temporal dimension. Finally, the VAE decoder decodes the result into a video clip. -|image01| +.. image:: https://humanaigc.github.io/animate-anyone/static/images/f2_img.png The pipeline contains 6 PyTorch modules: @@ -463,8 +377,6 @@ compression parameters. More details about weights compression can be found in `OpenVINO documentation `__. -.. |image01| image:: https://humanaigc.github.io/animate-anyone/static/images/f2_img.png - .. code:: ipython3 %%skip not $SHOULD_CONVERT @@ -522,12 +434,14 @@ of the pipeline, it will be better to convert them to separate models. .. parsed-literal:: + WARNING:nncf:NNCF provides best results with torch==2.1.2, while current torch version is 2.2.2+cpu. 
If you encounter issues, consider switching to torch==2.1.2 INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ 8 │ 100% (32 / 32) │ 100% (32 / 32) │ - ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ + +--------------+---------------------------+-----------------------------------+ + | Num bits (N) | % all parameters (layers) | % ratio-defining parameters | + | | | (layers) | + +==============+===========================+===================================+ + | 8 | 100% (32 / 32) | 100% (32 / 32) | + +--------------+---------------------------+-----------------------------------+ @@ -543,6 +457,14 @@ of the pipeline, it will be better to convert them to separate models. + + + + + + + + .. code:: ipython3 %%skip not $SHOULD_CONVERT @@ -568,11 +490,12 @@ of the pipeline, it will be better to convert them to separate models. .. 
parsed-literal:: INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ 8 │ 100% (40 / 40) │ 100% (40 / 40) │ - ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ + +--------------+---------------------------+-----------------------------------+ + | Num bits (N) | % all parameters (layers) | % ratio-defining parameters | + | | | (layers) | + +==============+===========================+===================================+ + | 8 | 100% (40 / 40) | 100% (40 / 40) | + +--------------+---------------------------+-----------------------------------+ @@ -588,6 +511,14 @@ of the pipeline, it will be better to convert them to separate models. + + + + + + + + Reference UNet ~~~~~~~~~~~~~~ @@ -634,11 +565,12 @@ step. .. parsed-literal:: INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ 8 │ 100% (270 / 270) │ 100% (270 / 270) │ - ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ + +--------------+---------------------------+-----------------------------------+ + | Num bits (N) | % all parameters (layers) | % ratio-defining parameters | + | | | (layers) | + +==============+===========================+===================================+ + | 8 | 100% (270 / 270) | 100% (270 / 270) | + +--------------+---------------------------+-----------------------------------+ @@ -654,6 +586,14 @@ step. + + + + + + + + Denoising UNet ~~~~~~~~~~~~~~ @@ -727,11 +667,12 @@ step. .. 
parsed-literal:: INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ 8 │ 100% (534 / 534) │ 100% (534 / 534) │ - ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ + +--------------+---------------------------+-----------------------------------+ + | Num bits (N) | % all parameters (layers) | % ratio-defining parameters | + | | | (layers) | + +==============+===========================+===================================+ + | 8 | 100% (534 / 534) | 100% (534 / 534) | + +--------------+---------------------------+-----------------------------------+ @@ -747,6 +688,14 @@ step. + + + + + + + + Pose Guider ~~~~~~~~~~~ @@ -773,11 +722,12 @@ efficiently integrate pose control signals into the denoising process. .. parsed-literal:: INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ 8 │ 100% (8 / 8) │ 100% (8 / 8) │ - ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ + +--------------+---------------------------+-----------------------------------+ + | Num bits (N) | % all parameters (layers) | % ratio-defining parameters | + | | | (layers) | + +==============+===========================+===================================+ + | 8 | 100% (8 / 8) | 100% (8 / 8) | + +--------------+---------------------------+-----------------------------------+ @@ -793,6 +743,14 @@ efficiently integrate pose control signals into the denoising process. 
+ + + + + + + + Image Encoder ~~~~~~~~~~~~~ @@ -818,19 +776,19 @@ required for both reference and denoising UNets. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead + /home/itrushkin/.virtualenvs/test/lib/python3.10/site-packages/transformers/modeling_utils.py:4225: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead warnings.warn( - `loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`. .. parsed-literal:: INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ 8 │ 100% (146 / 146) │ 100% (146 / 146) │ - ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ + +--------------+---------------------------+-----------------------------------+ + | Num bits (N) | % all parameters (layers) | % ratio-defining parameters | + | | | (layers) | + +==============+===========================+===================================+ + | 8 | 100% (146 / 146) | 100% (146 / 146) | + +--------------+---------------------------+-----------------------------------+ @@ -846,6 +804,14 @@ required for both reference and denoising UNets. + + + + + + + + Inference --------- @@ -871,6 +837,15 @@ For starting work, please select inference device from dropdown list. device = device_widget() + + + +.. 
parsed-literal:: + + Dropdown(description='Device:', index=5, options=('CPU', 'GPU.0', 'GPU.1', 'GPU.2', 'GPU.3', 'AUTO'), value='A… + + + .. code:: ipython3 class OVPose2VideoPipeline(Pose2VideoPipeline): @@ -1168,7 +1143,7 @@ Video post-processing .. raw:: html @@ -1242,23 +1217,9 @@ Interactive inference demo = make_demo(fn=generate) try: - demo.queue().launch(debug=False) + demo.queue().launch(debug=True) except Exception: - demo.queue().launch(debug=False, share=True) + demo.queue().launch(debug=True, share=True) # if you are launching remotely, specify server_name and server_port # demo.launch(server_name='your server name', server_port='server port in int') # Read more in the docs: https://gradio.app/docs/" - - -.. parsed-literal:: - - Running on local URL: http://127.0.0.1:7860 - - To create a public link, set `share=True` in `launch()`. - - - - - - - diff --git a/docs/notebooks/async-api-with-output.rst b/docs/notebooks/async-api-with-output.rst index 9f9130a4fe0db2..86ff1bc0aa9b0c 100644 --- a/docs/notebooks/async-api-with-output.rst +++ b/docs/notebooks/async-api-with-output.rst @@ -59,7 +59,14 @@ Imports .. code:: ipython3 %pip install -q "openvino>=2023.1.0" - %pip install -q opencv-python "matplotlib>=3.4" + %pip install -q opencv-python tqdm "matplotlib>=3.4" + + +.. parsed-literal:: + + Note: you may need to restart the kernel to use updated packages. + Note: you may need to restart the kernel to use updated packages. + .. code:: ipython3 @@ -97,26 +104,36 @@ the person in each frame of the video. .. code:: ipython3 + from pathlib import Path + # directory where model will be downloaded base_model_dir = "model" # model name as named in Open Model Zoo model_name = "person-detection-0202" precision = "FP16" - model_path = f"model/intel/{model_name}/{precision}/{model_name}.xml" - download_command = f"omz_downloader " f"--name {model_name} " f"--precision {precision} " f"--output_dir {base_model_dir} " f"--cache_dir {base_model_dir}" - ! 
$download_command + model_path = Path("model") / f"{model_name}.xml" + + base_model_url = "https://storage.openvinotoolkit.org/repositories/open_model_zoo/2023.0/models_bin/1" + + if not Path(model_path).exists(): + utils.download_file(f"{base_model_url}/{model_name}/{precision}/{model_name}.xml", filename=model_path.name, directory=model_path.parent) + utils.download_file( + f"{base_model_url}/{model_name}/{precision}/{model_name}.bin", filename=model_path.name.replace(".xml", ".bin"), directory=model_path.parent + ) + .. parsed-literal:: - ################|| Downloading person-detection-0202 ||################ - - ========== Retrieving model/intel/person-detection-0202/FP16/person-detection-0202.xml from the cache - - ========== Retrieving model/intel/person-detection-0202/FP16/person-detection-0202.bin from the cache - - + person-detection-0202.xml: 0%| | 0.00/249k [00:00=4.30.2", "controlnet-aux>=0.0.6", "gradio>=3.36", + "datasets>=2.14.6", + "nncf>=2.7.0", + "opencv-python", "--extra-index-url", "https://download.pytorch.org/whl/cpu", ) - pip_install("openvino>=2023.1.0", "datasets>=2.14.6", "nncf>=2.7.0", "opencv-python") + pip_install("openvino>=2023.1.0") Instantiating Generation Pipeline --------------------------------- @@ -296,7 +299,7 @@ Now, let us check its result on example image: image_path = Path("example_image.jpg") if not image_path.exists(): - download_file(image_path, filename="example_image.jpg") + download_file(example_url, filename="example_image.jpg") img = Image.open(image_path) pose = pose_estimator(img) diff --git a/docs/notebooks/convert-to-openvino-with-output.rst b/docs/notebooks/convert-to-openvino-with-output.rst index 507dd407eae739..bc5a45f244e376 100644 --- a/docs/notebooks/convert-to-openvino-with-output.rst +++ b/docs/notebooks/convert-to-openvino-with-output.rst @@ -184,10 +184,10 @@ NLP model from Hugging Face and export it in ONNX format: .. 
parsed-literal:: - 2024-11-22 00:16:16.864961: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 00:16:16.903350: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-09 23:09:00.018226: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-09 23:09:00.080568: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - 2024-11-22 00:16:17.575066: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT + 2024-12-09 23:09:00.743048: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT .. parsed-literal:: @@ -660,7 +660,7 @@ frameworks conversion guides. .. parsed-literal:: - 2024-11-22 00:16:33.997234: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform. + 2024-12-09 23:09:17.262024: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. 
Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform. Skipping registering GPU devices... diff --git a/docs/notebooks/convnext-classification-with-output.rst b/docs/notebooks/convnext-classification-with-output.rst index 9466f30c22898e..1204ea2c17f106 100644 --- a/docs/notebooks/convnext-classification-with-output.rst +++ b/docs/notebooks/convnext-classification-with-output.rst @@ -192,7 +192,7 @@ And print results Predicted Class: 281 Predicted Label: n02123045 tabby, tabby cat - Predicted Probability: 0.5919997096061707 + Predicted Probability: 0.5351971983909607 Convert the model to OpenVINO Intermediate representation format diff --git a/docs/notebooks/cross-lingual-books-alignment-with-output.rst b/docs/notebooks/cross-lingual-books-alignment-with-output.rst index b116f0e1f5cda1..68f51ad137ff16 100644 --- a/docs/notebooks/cross-lingual-books-alignment-with-output.rst +++ b/docs/notebooks/cross-lingual-books-alignment-with-output.rst @@ -32,7 +32,7 @@ Prerequisites - ``requests`` - for getting books - ``pysbd`` - for splitting sentences -- ``transformers[torch]`` and ``openvino_dev`` - for getting sentence +- ``transformers[torch]`` and ``openvino`` - for getting sentence embeddings - ``seaborn`` - for alignment matrix visualization - ``ipywidgets`` - for displaying HTML and JS output in the notebook @@ -416,12 +416,12 @@ languages. It has the same architecture as the BERT model but has been trained on a different task: to produce identical embeddings for translation pairs. -|image01| +|image02| This makes LaBSE a great choice for our task and it can be reused for different language pairs still producing good results. -.. |image01| image:: https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/627d3a39-7076-479f-a7b1-392f49a0b83e +.. 
|image02| image:: https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/627d3a39-7076-479f-a7b1-392f49a0b83e .. code:: ipython3 diff --git a/docs/notebooks/ct-segmentation-quantize-nncf-with-output.rst b/docs/notebooks/ct-segmentation-quantize-nncf-with-output.rst index c3d645f1841a17..853da533385284 100644 --- a/docs/notebooks/ct-segmentation-quantize-nncf-with-output.rst +++ b/docs/notebooks/ct-segmentation-quantize-nncf-with-output.rst @@ -154,10 +154,10 @@ Imports .. parsed-literal:: - 2024-11-22 00:16:56.689204: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 00:16:56.724390: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-09 23:09:41.789833: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-09 23:09:41.824673: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - 2024-11-22 00:16:57.319913: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT + 2024-12-09 23:09:42.418712: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT .. parsed-literal:: @@ -213,7 +213,7 @@ notebook `__. .. 
parsed-literal:: - pretrained_model/unet_kits19_state_dict.pth: 0%| | 0.00/7.58M [00:00`__. .. parsed-literal:: - /tmp/ipykernel_3514722/1592321960.py:3: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + /tmp/ipykernel_2165966/1592321960.py:3: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. 
state_dict = torch.load(state_dict_file, map_location=torch.device("cpu")) @@ -444,7 +444,7 @@ this notebook. .. parsed-literal:: [ WARNING ] Please fix your imports. Module %s has been moved to %s. The old module will be deleted in version %s. - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/monai/networks/nets/basic_unet.py:168: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/monai/networks/nets/basic_unet.py:168: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if x_e.shape[-i - 1] != x_0.shape[-i - 1]: @@ -526,18 +526,18 @@ Convert quantized model to OpenVINO IR model and save it. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/nncf/torch/quantization/layers.py:340: TracerWarning: Converting a tensor to a Python number might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
+ /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/nncf/torch/quantization/layers.py:340: TracerWarning: Converting a tensor to a Python number might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! return self._level_low.item() - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/nncf/torch/quantization/layers.py:348: TracerWarning: Converting a tensor to a Python number might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/nncf/torch/quantization/layers.py:348: TracerWarning: Converting a tensor to a Python number might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! return self._level_high.item() - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/monai/networks/nets/basic_unet.py:168: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
+ /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/monai/networks/nets/basic_unet.py:168: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if x_e.shape[-i - 1] != x_0.shape[-i - 1]: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/jit/_trace.py:1303: TracerWarning: Output nr 1. of the traced function does not match the corresponding output of the Python function. Detailed error: + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/jit/_trace.py:1303: TracerWarning: Output nr 1. of the traced function does not match the corresponding output of the Python function. Detailed error: Tensor-likes are not close! - Mismatched elements: 247214 / 262144 (94.3%) - Greatest absolute difference: 4.1846349239349365 at index (0, 0, 379, 430) (up to 1e-05 allowed) - Greatest relative difference: 15984.079041034269 at index (0, 0, 447, 390) (up to 1e-05 allowed) + Mismatched elements: 250458 / 262144 (95.5%) + Greatest absolute difference: 3.8674159049987793 at index (0, 0, 351, 76) (up to 1e-05 allowed) + Greatest relative difference: 12206.866810726728 at index (0, 0, 144, 31) (up to 1e-05 allowed) _check_trace( @@ -663,7 +663,7 @@ be run in the notebook with ``! benchmark_app`` or [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.LATENCY. 
[Step 4/11] Reading model files [ INFO ] Loading model files - [ INFO ] Read model took 8.99 ms + [ INFO ] Read model took 8.90 ms [ INFO ] Original model I/O parameters: [ INFO ] Model inputs: [ INFO ] x (node: x) : f32 / [...] / [1,1,512,512] @@ -677,7 +677,7 @@ be run in the notebook with ``! benchmark_app`` or [ INFO ] Model outputs: [ INFO ] ***NO_NAME*** (node: __module.final_conv/aten::_convolution/Add) : f32 / [...] / [1,1,512,512] [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 240.78 ms + [ INFO ] Compile model took 264.91 ms [Step 8/11] Querying optimal runtime parameters [ INFO ] Model: [ INFO ] NETWORK_NAME: Model0 @@ -714,17 +714,17 @@ be run in the notebook with ``! benchmark_app`` or [ INFO ] Fill input 'x' with random values [Step 10/11] Measuring performance (Start inference synchronously, limits: 15000 ms duration) [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 49.70 ms + [ INFO ] First inference took 48.49 ms [Step 11/11] Dumping statistics report [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 425 iterations - [ INFO ] Duration: 15023.51 ms + [ INFO ] Count: 431 iterations + [ INFO ] Duration: 15002.34 ms [ INFO ] Latency: - [ INFO ] Median: 34.55 ms - [ INFO ] Average: 35.13 ms - [ INFO ] Min: 34.21 ms - [ INFO ] Max: 47.23 ms - [ INFO ] Throughput: 28.29 FPS + [ INFO ] Median: 34.52 ms + [ INFO ] Average: 34.59 ms + [ INFO ] Min: 34.20 ms + [ INFO ] Max: 36.19 ms + [ INFO ] Throughput: 28.73 FPS .. code:: ipython3 @@ -750,7 +750,7 @@ be run in the notebook with ``! benchmark_app`` or [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.LATENCY. 
[Step 4/11] Reading model files [ INFO ] Loading model files - [ INFO ] Read model took 11.10 ms + [ INFO ] Read model took 10.56 ms [ INFO ] Original model I/O parameters: [ INFO ] Model inputs: [ INFO ] x (node: x) : f32 / [...] / [1,1,512,512] @@ -764,7 +764,7 @@ be run in the notebook with ``! benchmark_app`` or [ INFO ] Model outputs: [ INFO ] ***NO_NAME*** (node: __module.final_conv/aten::_convolution/Add) : f32 / [...] / [1,1,512,512] [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 251.41 ms + [ INFO ] Compile model took 248.98 ms [Step 8/11] Querying optimal runtime parameters [ INFO ] Model: [ INFO ] NETWORK_NAME: Model49 @@ -801,17 +801,17 @@ be run in the notebook with ``! benchmark_app`` or [ INFO ] Fill input 'x' with random values [Step 10/11] Measuring performance (Start inference synchronously, limits: 15000 ms duration) [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 29.68 ms + [ INFO ] First inference took 29.18 ms [Step 11/11] Dumping statistics report [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 911 iterations - [ INFO ] Duration: 15009.49 ms + [ INFO ] Count: 908 iterations + [ INFO ] Duration: 15011.20 ms [ INFO ] Latency: - [ INFO ] Median: 15.73 ms - [ INFO ] Average: 16.27 ms - [ INFO ] Min: 15.41 ms - [ INFO ] Max: 24.40 ms - [ INFO ] Throughput: 60.69 FPS + [ INFO ] Median: 15.48 ms + [ INFO ] Average: 16.33 ms + [ INFO ] Min: 15.15 ms + [ INFO ] Max: 28.31 ms + [ INFO ] Throughput: 60.49 FPS Visually Compare Inference Results @@ -905,7 +905,7 @@ seed is displayed to enable reproducing specific runs of this cell. .. parsed-literal:: - Visualizing results with seed 1732231099 + Visualizing results with seed 1733782265 @@ -989,7 +989,7 @@ performs inference, and displays the results on the frames loaded in .. parsed-literal:: Loaded model to AUTO in 0.15 seconds. 
- Total time for 68 frames: 2.31 seconds, fps:29.91 + Total time for 68 frames: 2.32 seconds, fps:29.70 References diff --git a/docs/notebooks/ct-segmentation-quantize-nncf-with-output_files/ct-segmentation-quantize-nncf-with-output_37_1.png b/docs/notebooks/ct-segmentation-quantize-nncf-with-output_files/ct-segmentation-quantize-nncf-with-output_37_1.png index a0c854d6dd33f6..fc10c642d8d2a1 100644 --- a/docs/notebooks/ct-segmentation-quantize-nncf-with-output_files/ct-segmentation-quantize-nncf-with-output_37_1.png +++ b/docs/notebooks/ct-segmentation-quantize-nncf-with-output_files/ct-segmentation-quantize-nncf-with-output_37_1.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:588fb52eb7dcf0ede69419b9645ad6dc93526e8960af83679e12bac98e6817f6 -size 385527 +oid sha256:52955890ed558e516a361399057b8529ffd5103a7b63ed20a2549062b4d900b5 +size 386283 diff --git a/docs/notebooks/ddcolor-image-colorization-with-output.rst b/docs/notebooks/ddcolor-image-colorization-with-output.rst index cd3bf024065b55..11b92fe4897f5e 100644 --- a/docs/notebooks/ddcolor-image-colorization-with-output.rst +++ b/docs/notebooks/ddcolor-image-colorization-with-output.rst @@ -135,7 +135,7 @@ Prerequisites .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/timm/models/layers/__init__.py:48: FutureWarning: Importing from timm.models.layers is deprecated, please import via timm.layers + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/timm/models/layers/__init__.py:48: FutureWarning: Importing from timm.models.layers is deprecated, please import via timm.layers warnings.warn(f"Importing from {__name__} is deprecated, please import via timm.layers", FutureWarning) @@ -416,10 +416,10 @@ Perform model quantization .. 
parsed-literal:: - 2024-11-22 00:20:47.511999: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 00:20:47.551328: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-09 23:13:28.920989: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-09 23:13:28.960154: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - 2024-11-22 00:20:47.960841: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT + 2024-12-09 23:13:29.365051: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT @@ -527,7 +527,7 @@ Tool + @@ -314,13 +313,13 @@ loading on device using ``core.complie_model``. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/depth-anything/Depth-Anything-V2/depth_anything_v2/dinov2_layers/patch_embed.py:73: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
+ /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/depth-anything/Depth-Anything-V2/depth_anything_v2/dinov2_layers/patch_embed.py:73: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}" - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/depth-anything/Depth-Anything-V2/depth_anything_v2/dinov2_layers/patch_embed.py:74: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/depth-anything/Depth-Anything-V2/depth_anything_v2/dinov2_layers/patch_embed.py:74: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}" - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/depth-anything/Depth-Anything-V2/depth_anything_v2/dinov2.py:183: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. 
We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/depth-anything/Depth-Anything-V2/depth_anything_v2/dinov2.py:183: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if npatch == N and w == h: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/depth-anything/Depth-Anything-V2/depth_anything_v2/dpt.py:147: TracerWarning: Converting a tensor to a Python integer might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/depth-anything/Depth-Anything-V2/depth_anything_v2/dpt.py:147: TracerWarning: Converting a tensor to a Python integer might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! out = F.interpolate(out, (int(patch_h * 14), int(patch_w * 14)), mode="bilinear", align_corners=True) @@ -412,7 +411,7 @@ range. .. 
parsed-literal:: - + @@ -429,11 +428,10 @@ Run inference on video VIDEO_FILE = "./Coco Walking in Berkeley.mp4" - if not Path(VIDEO_FILE).exists(): - download_file( - "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/video/Coco%20Walking%20in%20Berkeley.mp4", - VIDEO_FILE, - ) + download_file( + "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/video/Coco%20Walking%20in%20Berkeley.mp4", + VIDEO_FILE, + ) # Number of seconds of input video to process. Set `NUM_SECONDS` to 0 to process # the full video. @@ -636,7 +634,7 @@ Run inference on video .. parsed-literal:: - Processed 60 frames in 13.34 seconds. Total FPS (including video processing): 4.50.Inference FPS: 10.65 + Processed 60 frames in 13.15 seconds. Total FPS (including video processing): 4.56.Inference FPS: 10.69 Video saved to 'output/Coco Walking in Berkeley_depth_anything.mp4'. @@ -663,7 +661,7 @@ Run inference on video .. parsed-literal:: Showing video saved at - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/depth-anything/output/Coco Walking in Berkeley_depth_anything.mp4 + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/depth-anything/output/Coco Walking in Berkeley_depth_anything.mp4 If you cannot see the video in your browser, please click on the following link to download the video @@ -797,10 +795,10 @@ quantization code below may take some time. .. parsed-literal:: - 2024-11-22 00:29:02.540402: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 
- 2024-11-22 00:29:02.574640: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-09 23:21:25.394147: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-09 23:21:25.427427: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - 2024-11-22 00:29:03.160362: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT + 2024-12-09 23:21:26.001101: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT @@ -926,10 +924,10 @@ data. .. parsed-literal:: - Processed 60 frames in 12.91 seconds. Total FPS (including video processing): 4.65.Inference FPS: 12.59 + Processed 60 frames in 12.60 seconds. Total FPS (including video processing): 4.76.Inference FPS: 13.15 Video saved to 'output/Coco Walking in Berkeley_depth_anything_int8.mp4'. 
Showing video saved at - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/depth-anything/output/Coco Walking in Berkeley_depth_anything.mp4 + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/depth-anything/output/Coco Walking in Berkeley_depth_anything.mp4 If you cannot see the video in your browser, please click on the following link to download the video @@ -1009,9 +1007,9 @@ Tool =4.8.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (4.12.2) - Requirement already satisfied: sympy in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (1.13.3) - Requirement already satisfied: networkx in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (3.1) - Requirement already satisfied: jinja2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (3.1.4) - Requirement already satisfied: fsspec in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (2024.9.0) - Requirement already satisfied: numpy in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torchvision) (1.23.5) - Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torchvision) (10.4.0) - Requirement already satisfied: MarkupSafe>=2.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from jinja2->torch) (2.1.5) - Requirement already satisfied: mpmath<1.4,>=1.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from sympy->torch) (1.3.0) + Requirement already satisfied: torch in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (2.4.1+cpu) + Requirement already satisfied: torchvision in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (0.19.1+cpu) + Requirement already satisfied: opencv-python in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (4.10.0.84) + Requirement already satisfied: wheel in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (0.45.1) + Requirement already satisfied: filelock in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (3.16.1) + Requirement already satisfied: typing-extensions>=4.8.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (4.12.2) + Requirement already satisfied: sympy in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (1.13.3) + Requirement already satisfied: networkx in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (3.1) + Requirement already satisfied: jinja2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (3.1.4) + Requirement already satisfied: fsspec in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch) (2024.9.0) + Requirement already satisfied: numpy in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torchvision) (1.23.5) + Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torchvision) (10.4.0) + Requirement already satisfied: MarkupSafe>=2.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from jinja2->torch) (2.1.5) + Requirement already satisfied: mpmath<1.4,>=1.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from sympy->torch) (1.3.0) Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu Collecting git+https://github.com/facebookresearch/detectron2.git - Cloning https://github.com/facebookresearch/detectron2.git to /tmp/pip-req-build-4klmx21d + Cloning 
https://github.com/facebookresearch/detectron2.git to /tmp/pip-req-build-89enhchj .. parsed-literal:: - Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/detectron2.git /tmp/pip-req-build-4klmx21d + Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/detectron2.git /tmp/pip-req-build-89enhchj .. parsed-literal:: @@ -125,73 +125,73 @@ Install required packages for running model Resolved https://github.com/facebookresearch/detectron2.git to commit c69939aa85460e8135f40bce908a6cddaa73065f Preparing metadata (setup.py): started Preparing metadata (setup.py): finished with status 'done' - Requirement already satisfied: Pillow>=7.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (10.4.0) - Requirement already satisfied: black in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (24.3.0) - Requirement already satisfied: cloudpickle in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (3.1.0) - Requirement already satisfied: fvcore<0.1.6,>=0.1.5 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (0.1.5.post20221221) + Requirement already satisfied: Pillow>=7.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (10.4.0) + Requirement already satisfied: black in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (24.3.0) + Requirement already satisfied: cloudpickle in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (3.1.0) + Requirement already satisfied: fvcore<0.1.6,>=0.1.5 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (0.1.5.post20221221) Collecting hydra-core>=1.1 (from detectron2==0.6) Using cached hydra_core-1.3.2-py3-none-any.whl.metadata (5.5 kB) Collecting iopath<0.1.10,>=0.1.7 (from detectron2==0.6) Using cached https://download.pytorch.org/whl/iopath-0.1.9-py3-none-any.whl (27 kB) - Requirement already satisfied: matplotlib in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (3.7.5) + Requirement already satisfied: matplotlib in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (3.7.5) Collecting omegaconf<2.4,>=2.1 (from detectron2==0.6) Using cached omegaconf-2.3.0-py3-none-any.whl.metadata (3.9 kB) - Requirement already satisfied: packaging in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (24.2) - Requirement already satisfied: pycocotools>=2.0.2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (2.0.7) - Requirement already satisfied: 
tabulate in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (0.9.0) - Requirement already satisfied: tensorboard in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (2.12.3) - Requirement already satisfied: termcolor>=1.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (2.4.0) - Requirement already satisfied: tqdm>4.29.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (4.67.0) - Requirement already satisfied: yacs>=0.1.8 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (0.1.8) - Requirement already satisfied: numpy in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from fvcore<0.1.6,>=0.1.5->detectron2==0.6) (1.23.5) - Requirement already satisfied: pyyaml>=5.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from fvcore<0.1.6,>=0.1.5->detectron2==0.6) (6.0.2) - Requirement already satisfied: antlr4-python3-runtime==4.9.* in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.1->detectron2==0.6) (4.9.3) - Requirement already satisfied: importlib-resources in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.1->detectron2==0.6) (6.4.5) - Requirement already satisfied: portalocker in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from iopath<0.1.10,>=0.1.7->detectron2==0.6) (3.0.0) - Requirement already satisfied: contourpy>=1.0.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from matplotlib->detectron2==0.6) (1.1.1) - Requirement already satisfied: cycler>=0.10 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from matplotlib->detectron2==0.6) (0.12.1) - Requirement already satisfied: fonttools>=4.22.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from matplotlib->detectron2==0.6) (4.55.0) - Requirement already satisfied: kiwisolver>=1.0.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from matplotlib->detectron2==0.6) (1.4.7) - Requirement already satisfied: pyparsing>=2.3.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from matplotlib->detectron2==0.6) (3.1.4) - Requirement already satisfied: python-dateutil>=2.7 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from matplotlib->detectron2==0.6) (2.9.0.post0) - Requirement already satisfied: click>=8.0.0 in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from black->detectron2==0.6) (8.1.7) - Requirement already satisfied: mypy-extensions>=0.4.3 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from black->detectron2==0.6) (1.0.0) - Requirement already satisfied: pathspec>=0.9.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from black->detectron2==0.6) (0.12.1) - Requirement already satisfied: platformdirs>=2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from black->detectron2==0.6) (4.3.6) - Requirement already satisfied: tomli>=1.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from black->detectron2==0.6) (2.1.0) - Requirement already satisfied: typing-extensions>=4.0.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from black->detectron2==0.6) (4.12.2) - Requirement already satisfied: absl-py>=0.4 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (1.4.0) - Requirement already satisfied: grpcio>=1.48.2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (1.68.0) - Requirement already satisfied: google-auth<3,>=1.6.3 in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (2.36.0) - Requirement already satisfied: google-auth-oauthlib<1.1,>=0.5 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (1.0.0) - Requirement already satisfied: markdown>=2.6.8 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (3.7) - Requirement already satisfied: protobuf>=3.19.6 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (3.20.3) - Requirement already satisfied: requests<3,>=2.21.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (2.32.3) - Requirement already satisfied: setuptools>=41.0.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (44.0.0) - Requirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (0.7.2) - Requirement already satisfied: werkzeug>=1.0.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (3.0.6) - Requirement already 
satisfied: wheel>=0.26 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (0.45.0) - Requirement already satisfied: cachetools<6.0,>=2.0.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from google-auth<3,>=1.6.3->tensorboard->detectron2==0.6) (5.5.0) - Requirement already satisfied: pyasn1-modules>=0.2.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from google-auth<3,>=1.6.3->tensorboard->detectron2==0.6) (0.4.1) - Requirement already satisfied: rsa<5,>=3.1.4 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from google-auth<3,>=1.6.3->tensorboard->detectron2==0.6) (4.9) - Requirement already satisfied: requests-oauthlib>=0.7.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from google-auth-oauthlib<1.1,>=0.5->tensorboard->detectron2==0.6) (2.0.0) - Requirement already satisfied: zipp>=3.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from importlib-resources->hydra-core>=1.1->detectron2==0.6) (3.20.2) - Requirement already satisfied: importlib-metadata>=4.4 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from markdown>=2.6.8->tensorboard->detectron2==0.6) (8.5.0) - Requirement already satisfied: six>=1.5 in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from python-dateutil>=2.7->matplotlib->detectron2==0.6) (1.16.0) - Requirement already satisfied: charset-normalizer<4,>=2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests<3,>=2.21.0->tensorboard->detectron2==0.6) (3.4.0) - Requirement already satisfied: idna<4,>=2.5 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests<3,>=2.21.0->tensorboard->detectron2==0.6) (3.10) - Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests<3,>=2.21.0->tensorboard->detectron2==0.6) (2.2.3) - Requirement already satisfied: certifi>=2017.4.17 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests<3,>=2.21.0->tensorboard->detectron2==0.6) (2024.8.30) - Requirement already satisfied: MarkupSafe>=2.1.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from werkzeug>=1.0.1->tensorboard->detectron2==0.6) (2.1.5) - Requirement already satisfied: pyasn1<0.7.0,>=0.4.6 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from pyasn1-modules>=0.2.1->google-auth<3,>=1.6.3->tensorboard->detectron2==0.6) (0.6.1) - Requirement already satisfied: oauthlib>=3.0.0 in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<1.1,>=0.5->tensorboard->detectron2==0.6) (3.2.2) + Requirement already satisfied: packaging in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (24.2) + Requirement already satisfied: pycocotools>=2.0.2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (2.0.7) + Requirement already satisfied: tabulate in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (0.9.0) + Requirement already satisfied: tensorboard in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (2.12.3) + Requirement already satisfied: termcolor>=1.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (2.4.0) + Requirement already satisfied: tqdm>4.29.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (4.67.1) + Requirement already satisfied: yacs>=0.1.8 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from detectron2==0.6) (0.1.8) + Requirement already satisfied: numpy in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from fvcore<0.1.6,>=0.1.5->detectron2==0.6) (1.23.5) + Requirement already satisfied: pyyaml>=5.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from fvcore<0.1.6,>=0.1.5->detectron2==0.6) (6.0.2) + Requirement already satisfied: antlr4-python3-runtime==4.9.* in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.1->detectron2==0.6) (4.9.3) + Requirement already satisfied: importlib-resources in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.1->detectron2==0.6) (6.4.5) + Requirement already satisfied: portalocker in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from iopath<0.1.10,>=0.1.7->detectron2==0.6) (3.0.0) + Requirement already satisfied: contourpy>=1.0.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from matplotlib->detectron2==0.6) (1.1.1) + Requirement already satisfied: cycler>=0.10 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from matplotlib->detectron2==0.6) (0.12.1) + Requirement already satisfied: fonttools>=4.22.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from matplotlib->detectron2==0.6) (4.55.2) + Requirement already 
satisfied: kiwisolver>=1.0.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from matplotlib->detectron2==0.6) (1.4.7) + Requirement already satisfied: pyparsing>=2.3.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from matplotlib->detectron2==0.6) (3.1.4) + Requirement already satisfied: python-dateutil>=2.7 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from matplotlib->detectron2==0.6) (2.9.0.post0) + Requirement already satisfied: click>=8.0.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from black->detectron2==0.6) (8.1.7) + Requirement already satisfied: mypy-extensions>=0.4.3 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from black->detectron2==0.6) (1.0.0) + Requirement already satisfied: pathspec>=0.9.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from black->detectron2==0.6) (0.12.1) + Requirement already satisfied: platformdirs>=2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from black->detectron2==0.6) (4.3.6) + Requirement already satisfied: tomli>=1.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from black->detectron2==0.6) (2.2.1) + Requirement already satisfied: typing-extensions>=4.0.1 in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from black->detectron2==0.6) (4.12.2) + Requirement already satisfied: absl-py>=0.4 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (1.4.0) + Requirement already satisfied: grpcio>=1.48.2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (1.68.1) + Requirement already satisfied: google-auth<3,>=1.6.3 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (2.36.0) + Requirement already satisfied: google-auth-oauthlib<1.1,>=0.5 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (1.0.0) + Requirement already satisfied: markdown>=2.6.8 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (3.7) + Requirement already satisfied: protobuf>=3.19.6 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (3.20.3) + Requirement already satisfied: requests<3,>=2.21.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (2.32.3) + Requirement already satisfied: setuptools>=41.0.0 in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (44.0.0) + Requirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (0.7.2) + Requirement already satisfied: werkzeug>=1.0.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (3.0.6) + Requirement already satisfied: wheel>=0.26 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from tensorboard->detectron2==0.6) (0.45.1) + Requirement already satisfied: cachetools<6.0,>=2.0.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from google-auth<3,>=1.6.3->tensorboard->detectron2==0.6) (5.5.0) + Requirement already satisfied: pyasn1-modules>=0.2.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from google-auth<3,>=1.6.3->tensorboard->detectron2==0.6) (0.4.1) + Requirement already satisfied: rsa<5,>=3.1.4 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from google-auth<3,>=1.6.3->tensorboard->detectron2==0.6) (4.9) + Requirement already satisfied: requests-oauthlib>=0.7.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from 
google-auth-oauthlib<1.1,>=0.5->tensorboard->detectron2==0.6) (2.0.0) + Requirement already satisfied: zipp>=3.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from importlib-resources->hydra-core>=1.1->detectron2==0.6) (3.20.2) + Requirement already satisfied: importlib-metadata>=4.4 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from markdown>=2.6.8->tensorboard->detectron2==0.6) (8.5.0) + Requirement already satisfied: six>=1.5 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from python-dateutil>=2.7->matplotlib->detectron2==0.6) (1.17.0) + Requirement already satisfied: charset-normalizer<4,>=2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests<3,>=2.21.0->tensorboard->detectron2==0.6) (3.4.0) + Requirement already satisfied: idna<4,>=2.5 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests<3,>=2.21.0->tensorboard->detectron2==0.6) (3.10) + Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests<3,>=2.21.0->tensorboard->detectron2==0.6) (2.2.3) + Requirement already satisfied: certifi>=2017.4.17 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests<3,>=2.21.0->tensorboard->detectron2==0.6) (2024.8.30) + Requirement already satisfied: 
MarkupSafe>=2.1.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from werkzeug>=1.0.1->tensorboard->detectron2==0.6) (2.1.5) + Requirement already satisfied: pyasn1<0.7.0,>=0.4.6 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from pyasn1-modules>=0.2.1->google-auth<3,>=1.6.3->tensorboard->detectron2==0.6) (0.6.1) + Requirement already satisfied: oauthlib>=3.0.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<1.1,>=0.5->tensorboard->detectron2==0.6) (3.2.2) Using cached hydra_core-1.3.2-py3-none-any.whl (154 kB) Using cached omegaconf-2.3.0-py3-none-any.whl (79 kB) Building wheels for collected packages: detectron2 Building wheel for detectron2 (setup.py): started Building wheel for detectron2 (setup.py): finished with status 'done' - Created wheel for detectron2: filename=detectron2-0.6-cp38-cp38-linux_x86_64.whl size=8313367 sha256=4eb79589c47d522c993509a8f16dfbf494af0f494c6a73577d9d3668c1ee4a05 - Stored in directory: /tmp/pip-ephem-wheel-cache-mkdcktsx/wheels/19/ac/65/e48e5e4ec2702274d927c5a6efb75709b24014371d3bb778f2 + Created wheel for detectron2: filename=detectron2-0.6-cp38-cp38-linux_x86_64.whl size=8313251 sha256=a744a8ccf54176a60e63af7e14e6a7f431f5b19935a3c1260a7d39f7a7f84bc8 + Stored in directory: /tmp/pip-ephem-wheel-cache-cb2ga2gq/wheels/19/ac/65/e48e5e4ec2702274d927c5a6efb75709b24014371d3bb778f2 Successfully built detectron2 Installing collected packages: omegaconf, iopath, hydra-core, detectron2 Attempting uninstall: omegaconf @@ -203,10 +203,10 @@ Install required packages for running model Uninstalling iopath-0.1.10: Successfully uninstalled iopath-0.1.10 Successfully installed 
detectron2-0.6 hydra-core-1.3.2 iopath-0.1.9 omegaconf-2.3.0 - Requirement already satisfied: openvino>=2023.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (2024.4.0) - Requirement already satisfied: numpy<2.1.0,>=1.16.6 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from openvino>=2023.1.0) (1.23.5) - Requirement already satisfied: openvino-telemetry>=2023.2.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from openvino>=2023.1.0) (2024.5.0) - Requirement already satisfied: packaging in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from openvino>=2023.1.0) (24.2) + Requirement already satisfied: openvino>=2023.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (2024.4.0) + Requirement already satisfied: numpy<2.1.0,>=1.16.6 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from openvino>=2023.1.0) (1.23.5) + Requirement already satisfied: openvino-telemetry>=2023.2.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from openvino>=2023.1.0) (2024.5.0) + Requirement already satisfied: packaging in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from openvino>=2023.1.0) (24.2) Define helpers for PyTorch model initialization and 
conversion diff --git a/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_22_0.jpg b/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_22_0.jpg index 2c18ecdc61719a..9ffd8dbc558859 100644 --- a/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_22_0.jpg +++ b/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_22_0.jpg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:edc1fd6c9bb94b1ff9dde163988de0d5635f35a9cb918138eb058de89fe36b6c -size 58029 +oid sha256:ec1aaa179217e234b7c93d22f9da2f1ac0281e5bf8e4271c4094c3d680793782 +size 58047 diff --git a/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_22_0.png b/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_22_0.png index 0890e13959d7b2..0c626a2f115cc3 100644 --- a/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_22_0.png +++ b/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_22_0.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b54cfa3647ce484120c2dac840789885273b1a61d0fdf6fd1fdb93e78753c114 -size 509016 +oid sha256:5ae2588579f79d5d3e23a9fd9870f28f3bd063b9166da901cd639f16f0f04fca +size 508747 diff --git a/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_32_0.jpg b/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_32_0.jpg index d2b1ec1ee92784..6063ffe4fca6ec 100644 --- a/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_32_0.jpg +++ b/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_32_0.jpg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:0ffdd1e786238678562e14aa201c2a602b1733bb7db8b1c175f7d86b3c011fa2 -size 54276 +oid sha256:d414af832026a73bf7d8a8165a202c499989ddbc4db0826e6e0ca1951b2b4605 +size 54234 diff --git a/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_32_0.png b/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_32_0.png index d970f117246904..656018b2fa8884 100644 --- a/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_32_0.png +++ b/docs/notebooks/detectron2-to-openvino-with-output_files/detectron2-to-openvino-with-output_32_0.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6b8a9ccae3ca190acfaa9ddaa9be7641e02edae972b15c49f21cf9a8de9ae454 -size 456077 +oid sha256:8aa8664400b8c51d604d23e669d6e44f766d4eb6b9958d38f4757d5e1cbefe88 +size 457666 diff --git a/docs/notebooks/dynamicrafter-animating-images-with-output.rst b/docs/notebooks/dynamicrafter-animating-images-with-output.rst index 13b4c9475f7092..fac46c9f3e6cad 100644 --- a/docs/notebooks/dynamicrafter-animating-images-with-output.rst +++ b/docs/notebooks/dynamicrafter-animating-images-with-output.rst @@ -151,6 +151,13 @@ Prerequisites %pip install -q "openvino>=2024.2.0" "nncf>=2.11.0" "datasets>=2.20.0" %pip install -q "gradio>=4.19" omegaconf einops pytorch_lightning kornia "open_clip_torch==2.22.0" transformers av opencv-python "torch==2.2.2" --extra-index-url https://download.pytorch.org/whl/cpu + +.. parsed-literal:: + + Note: you may need to restart the kernel to use updated packages. + Note: you may need to restart the kernel to use updated packages. + + .. code:: ipython3 from pathlib import Path @@ -168,6 +175,15 @@ Prerequisites ) open("notebook_utils.py", "w").write(r.text) + + + +.. parsed-literal:: + + 24624 + + + .. 
code:: ipython3 from cmd_helper import clone_repo @@ -175,6 +191,15 @@ Prerequisites clone_repo("https://github.com/Doubiiu/DynamiCrafter.git", "26e665cd6c174234238d2ded661e2e56f875d360") + + + +.. parsed-literal:: + + PosixPath('DynamiCrafter') + + + Load and run the original pipeline ---------------------------------- @@ -238,7 +263,7 @@ We will use model for 256x256 resolution as example. Also, models for hf_hub_download(repo_id=REPO_ID, filename="model.ckpt", local_dir="./checkpoints/dynamicrafter_256_v1/", local_dir_use_symlinks=False) ckpt_path = "checkpoints/dynamicrafter_256_v1/model.ckpt" - config_file = "dynamicrafter/configs/inference_256_v1.0.yaml" + config_file = "DynamiCrafter/configs/inference_256_v1.0.yaml" config = OmegaConf.load(config_file) model_config = config.pop("model", OmegaConf.create()) model_config["params"]["unet_config"]["params"]["use_checkpoint"] = False @@ -252,11 +277,56 @@ We will use model for 256x256 resolution as example. Also, models for model = download_model() +.. parsed-literal:: + + Note: switching to '26e665cd6c174234238d2ded661e2e56f875d360'. + + You are in 'detached HEAD' state. You can look around, make experimental + changes and commit them, and you can discard any commits you make in this + state without impacting any branches by switching back to a branch. + + If you want to create a new branch to retain commits you create, you may + do so (now or later) by using -c with the switch command. Example: + + git switch -c + + Or undo this operation with: + + git switch - + + Turn off this advice by setting config variable advice.detachedHead to false + + HEAD is now at 26e665c add dataset + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/huggingface_hub/file_download.py:1204: UserWarning: `local_dir_use_symlinks` parameter is deprecated and will be ignored. 
The process to download files to a local folder has been updated and do not rely on symlinks anymore. You only need to pass a destination folder as`local_dir`. + For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder. + warnings.warn( + + + +.. parsed-literal:: + + model.ckpt: 0%| | 0.00/10.4G [00:00>> model checkpoint loaded. - + Convert the model to OpenVINO IR -------------------------------- @@ -333,6 +403,17 @@ Convert CLIP text encoder del cond_stage_model gc.collect(); + +.. parsed-literal:: + + WARNING:tensorflow:Please fix your imports. Module tensorflow.python.training.tracking.base has been moved to tensorflow.python.trackable.base. The old module will be deleted in version 2.11. + + +.. parsed-literal:: + + WARNING:tensorflow:Please fix your imports. Module tensorflow.python.training.tracking.base has been moved to tensorflow.python.trackable.base. The old module will be deleted in version 2.11. + + Convert CLIP image encoder ~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -355,6 +436,49 @@ resolutions. del model.embedder gc.collect(); + +.. parsed-literal:: + + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/utils/image.py:226: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + if input.numel() == 0: + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/geometry/transform/affwarp.py:573: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. 
We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + if size == input_size: + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/geometry/transform/affwarp.py:579: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + antialias = antialias and (max(factors) > 1) + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/geometry/transform/affwarp.py:581: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + if antialias: + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/geometry/transform/affwarp.py:584: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
+ sigmas = (max((factors[0] - 1.0) / 2.0, 0.001), max((factors[1] - 1.0) / 2.0, 0.001)) + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/geometry/transform/affwarp.py:589: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + ks = int(max(2.0 * 2 * sigmas[0], 3)), int(max(2.0 * 2 * sigmas[1], 3)) + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/geometry/transform/affwarp.py:589: TracerWarning: Converting a tensor to a Python integer might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + ks = int(max(2.0 * 2 * sigmas[0], 3)), int(max(2.0 * 2 * sigmas[1], 3)) + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/filters/gaussian.py:55: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect. + sigma = tensor([sigma], device=input.device, dtype=input.dtype) + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/filters/gaussian.py:55: TracerWarning: Converting a tensor to a Python float might cause the trace to be incorrect. 
We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + sigma = tensor([sigma], device=input.device, dtype=input.dtype) + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/core/check.py:78: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + if x_shape_to_check[i] != dim: + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/filters/kernels.py:92: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect. + mean = tensor([[mean]], device=sigma.device, dtype=sigma.dtype) + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/enhance/normalize.py:101: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
+ if len(mean.shape) == 0 or mean.shape[0] == 1: + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/enhance/normalize.py:103: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + if len(std.shape) == 0 or std.shape[0] == 1: + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/enhance/normalize.py:107: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + if mean.shape and mean.shape[0] != 1: + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/enhance/normalize.py:108: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + if mean.shape[0] != data.shape[1] and mean.shape[:2] != data.shape[:2]: + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/enhance/normalize.py:112: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. 
This means that the trace might not generalize to other inputs! + if std.shape and std.shape[0] != 1: + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/enhance/normalize.py:113: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + if std.shape[0] != data.shape[1] and std.shape[:2] != data.shape[:2]: + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/enhance/normalize.py:116: TracerWarning: torch.as_tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect. + mean = torch.as_tensor(mean, device=data.device, dtype=data.dtype) + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/kornia/enhance/normalize.py:117: TracerWarning: torch.as_tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect. + std = torch.as_tensor(std, device=data.device, dtype=data.dtype) + + Convert AE encoder ~~~~~~~~~~~~~~~~~~ @@ -377,6 +501,13 @@ Convert AE encoder del model.first_stage_model.encoder gc.collect(); + +.. 
parsed-literal:: + + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/dynamicrafter-animating-images/DynamiCrafter/lvdm/modules/networks/ae_modules.py:67: TracerWarning: Converting a tensor to a Python integer might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + w_ = w_ * (int(c)**(-0.5)) + + Convert Diffusion U-Net model ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -414,6 +545,21 @@ Convert Diffusion U-Net model del model.model.diffusion_model gc.collect(); + +.. parsed-literal:: + + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/dynamicrafter-animating-images/DynamiCrafter/lvdm/modules/networks/openaimodel3d.py:556: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + if l_context == 77 + t*16: ## !!! HARD CODE here + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/dynamicrafter-animating-images/DynamiCrafter/lvdm/modules/networks/openaimodel3d.py:205: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
+ if batch_size: + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/dynamicrafter-animating-images/DynamiCrafter/lvdm/modules/networks/openaimodel3d.py:232: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + if self.use_temporal_conv and batch_size: + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/dynamicrafter-animating-images/DynamiCrafter/lvdm/modules/networks/openaimodel3d.py:76: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + assert x.shape[1] == self.channels + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/dynamicrafter-animating-images/DynamiCrafter/lvdm/modules/networks/openaimodel3d.py:99: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + assert x.shape[1] == self.channels + + Convert AE decoder ~~~~~~~~~~~~~~~~~~ @@ -785,7 +931,7 @@ Run OpenVINO pipeline inference .. code:: ipython3 - image_path = "dynamicrafter/prompts/256/art.png" + image_path = "DynamiCrafter/prompts/256/art.png" prompt = "man fishing in a boat at sunset" seed = 234 image = Image.open(image_path) @@ -797,15 +943,15 @@ Run OpenVINO pipeline inference .. 
parsed-literal:: Seed set to 234 - /tmp/ipykernel_971108/2451984876.py:25: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:206.) + /tmp/ipykernel_2173449/2451984876.py:25: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:206.) img_tensor = torch.from_numpy(image).permute(2, 0, 1).float().to(model.device) - + .. parsed-literal:: - start: man fishing in a boat at sunset 2024-08-06 13:54:24 - Saved in man_fishing_in_a_boat_at_sunset.mp4. Time used: 164.28 seconds - + start: man fishing in a boat at sunset 2024-12-09 23:46:36 + Saved in man_fishing_in_a_boat_at_sunset.mp4. Time used: 194.37 seconds + .. code:: ipython3 @@ -828,7 +974,7 @@ Run OpenVINO pipeline inference - + @@ -1000,6 +1146,19 @@ To collect intermediate model inputs for calibration we should customize 0%| | 0/300 [00:00>> model checkpoint loaded. - + .. code:: ipython3 %%skip not $to_quantize.value - image_path = "dynamicrafter/prompts/256/art.png" + image_path = "DynamiCrafter/prompts/256/art.png" prompt = "man fishing in a boat at sunset" seed = 234 image = Image.open(image_path) @@ -1317,13 +1375,13 @@ Let’s run the optimized pipeline .. parsed-literal:: Seed set to 234 - + .. 
parsed-literal:: - start: man fishing in a boat at sunset 2024-08-06 15:09:26 - Saved in man_fishing_in_a_boat_at_sunset.mp4. Time used: 81.47 seconds - + start: man fishing in a boat at sunset 2024-12-10 01:17:34 + Saved in man_fishing_in_a_boat_at_sunset.mp4. Time used: 98.80 seconds + .. code:: ipython3 @@ -1345,7 +1403,7 @@ Let’s run the optimized pipeline - + Compare model file sizes @@ -1373,7 +1431,7 @@ Compare model file sizes encoder_first_stage_ir compression rate: 3.986 embedder_ir compression rate: 3.977 model_ir compression rate: 3.981 - + Compare inference time of the FP32 and INT8 models ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1427,10 +1485,10 @@ models, we use median inference time on calibration subset. .. parsed-literal:: - FP32 latency: 162.304 - INT8 latency: 79.590 - Performance speed up: 2.039 - + FP32 latency: 193.524 + INT8 latency: 97.073 + Performance speed up: 1.994 + Interactive inference --------------------- @@ -1454,6 +1512,15 @@ to launch the interactive demo. use_quantized_models + + + +.. parsed-literal:: + + Checkbox(value=True, description='Use quantized models') + + + .. code:: ipython3 from functools import partial @@ -1472,9 +1539,23 @@ to launch the interactive demo. demo = make_demo(fn=get_image_fn) try: - demo.queue().launch(debug=True) + demo.queue().launch(debug=False) except Exception: - demo.queue().launch(debug=True, share=True) + demo.queue().launch(debug=False, share=True) # if you are launching remotely, specify server_name and server_port # demo.launch(server_name='your server name', server_port='server port in int') # Read more in the docs: https://gradio.app/docs/ + + +.. parsed-literal:: + + Running on local URL: http://127.0.0.1:7860 + + To create a public link, set `share=True` in `launch()`. 
+ + + + + + + diff --git a/docs/notebooks/efficient-sam-with-output.rst b/docs/notebooks/efficient-sam-with-output.rst index 2341db94e22f68..ce83a3675d1d8c 100644 --- a/docs/notebooks/efficient-sam-with-output.rst +++ b/docs/notebooks/efficient-sam-with-output.rst @@ -82,11 +82,22 @@ Prerequisites .. code:: ipython3 - %pip install -q "openvino>=2023.3.0" "nncf>=2.7.0" opencv-python "gradio>=4.13" "matplotlib>=3.4" torch torchvision tqdm --extra-index-url https://download.pytorch.org/whl/cpu + import platform + + %pip install -q "openvino>=2024.5.0" "nncf>=2.14.0" + %pip install -q "torch>=2.2.0" "torchaudio>=2.2.0" "torchvision>=0.17.0" --extra-index-url https://download.pytorch.org/whl/cpu + %pip install -q opencv-python "gradio>=4.13" "matplotlib>=3.4" tqdm + + if platform.system() == "Darwin": + %pip install -q "numpy<2.0.0" .. parsed-literal:: + ERROR: Could not find a version that satisfies the requirement openvino>=2024.5.0 (from versions: 2021.3.0, 2021.4.0, 2021.4.1, 2021.4.2, 2022.1.0, 2022.2.0, 2022.3.0, 2022.3.1, 2022.3.2, 2023.0.0.dev20230119, 2023.0.0.dev20230217, 2023.0.0.dev20230407, 2023.0.0.dev20230427, 2023.0.0, 2023.0.1, 2023.0.2, 2023.1.0.dev20230623, 2023.1.0.dev20230728, 2023.1.0.dev20230811, 2023.1.0, 2023.2.0.dev20230922, 2023.2.0, 2023.3.0, 2024.0.0, 2024.1.0, 2024.2.0, 2024.3.0, 2024.4.0, 2024.4.1.dev20240926) + ERROR: No matching distribution found for openvino>=2024.5.0 + Note: you may need to restart the kernel to use updated packages. + Note: you may need to restart the kernel to use updated packages. Note: you may need to restart the kernel to use updated packages. @@ -130,7 +141,7 @@ Prerequisites .. 
parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM Load PyTorch model @@ -203,6 +214,13 @@ build PyTorch model pt_model.eval(); + +.. parsed-literal:: + + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam.py:303: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + state_dict = torch.load(f, map_location="cpu") + + Run PyTorch model inference --------------------------- @@ -385,23 +403,23 @@ disk using ``openvino.save_model``. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam.py:220: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. 
We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam.py:220: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if ( - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam_encoder.py:241: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam_encoder.py:241: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! assert ( - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam_encoder.py:163: TracerWarning: Converting a tensor to a Python float might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. 
This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam_encoder.py:163: TracerWarning: Converting a tensor to a Python float might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! size = int(math.sqrt(xy_num)) - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam_encoder.py:164: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam_encoder.py:164: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! assert size * size == xy_num - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam_encoder.py:166: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
+ /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam_encoder.py:166: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if size != h or size != w: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam_encoder.py:251: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam_encoder.py:251: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! assert x.shape[2] == num_patches - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam.py:85: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
+ /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam.py:85: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if num_pts > self.decoder_max_num_input_points: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam.py:92: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam.py:92: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! elif num_pts < self.decoder_max_num_input_points: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam.py:126: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
+ /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam/EfficientSAM/efficient_sam/efficient_sam.py:126: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if output_w > 0 and output_h > 0: @@ -648,10 +666,10 @@ architecture type, we should specify ``transformer`` in ``model_type``. .. parsed-literal:: - 2024-11-22 00:51:57.265752: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 00:51:57.297997: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 01:35:21.740526: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 01:35:21.772231: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. 
- 2024-11-22 00:51:57.938257: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT + 2024-12-10 01:35:22.412391: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT @@ -818,7 +836,7 @@ models, we use ``bencmark_app``. [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. [Step 4/11] Reading model files [ INFO ] Loading model files - [ INFO ] Read model took 29.71 ms + [ INFO ] Read model took 29.92 ms [ INFO ] Original model I/O parameters: [ INFO ] Model inputs: [ INFO ] batched_images (node: batched_images) : f32 / [...] / [?,?,?,?] @@ -838,7 +856,7 @@ models, we use ``bencmark_app``. [ INFO ] ***NO_NAME*** (node: aten::reshape/Reshape_3) : f32 / [...] / [?,?,?,?,?] [ INFO ] ***NO_NAME*** (node: aten::reshape/Reshape_2) : f32 / [...] / [?,?,?] [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 1398.31 ms + [ INFO ] Compile model took 1396.28 ms [Step 8/11] Querying optimal runtime parameters [ INFO ] Model: [ INFO ] NETWORK_NAME: Model0 @@ -879,17 +897,17 @@ models, we use ``bencmark_app``. [ INFO ] Fill input 'batched_point_labels' with random values [Step 10/11] Measuring performance (Start inference asynchronously, 6 inference requests, limits: 15000 ms duration) [ INFO ] Benchmarking in full mode (inputs filling are included in measurement loop). 
- [ INFO ] First inference took 793.15 ms + [ INFO ] First inference took 850.98 ms [Step 11/11] Dumping statistics report [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 55 iterations - [ INFO ] Duration: 17124.15 ms + [ INFO ] Count: 49 iterations + [ INFO ] Duration: 16117.80 ms [ INFO ] Latency: - [ INFO ] Median: 1829.15 ms - [ INFO ] Average: 1806.67 ms - [ INFO ] Min: 872.57 ms - [ INFO ] Max: 2037.03 ms - [ INFO ] Throughput: 3.21 FPS + [ INFO ] Median: 1890.12 ms + [ INFO ] Average: 1899.68 ms + [ INFO ] Min: 1013.52 ms + [ INFO ] Max: 2315.56 ms + [ INFO ] Throughput: 3.04 FPS .. code:: ipython3 @@ -915,7 +933,7 @@ models, we use ``bencmark_app``. [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. [Step 4/11] Reading model files [ INFO ] Loading model files - [ INFO ] Read model took 43.85 ms + [ INFO ] Read model took 43.16 ms [ INFO ] Original model I/O parameters: [ INFO ] Model inputs: [ INFO ] batched_images (node: batched_images) : f32 / [...] / [?,?,?,?] @@ -935,7 +953,7 @@ models, we use ``bencmark_app``. [ INFO ] ***NO_NAME*** (node: aten::reshape/Reshape_3) : f32 / [...] / [?,?,?,?,?] [ INFO ] ***NO_NAME*** (node: aten::reshape/Reshape_2) : f32 / [...] / [?,?,?] [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 1631.76 ms + [ INFO ] Compile model took 1639.65 ms [Step 8/11] Querying optimal runtime parameters [ INFO ] Model: [ INFO ] NETWORK_NAME: Model0 @@ -976,17 +994,17 @@ models, we use ``bencmark_app``. [ INFO ] Fill input 'batched_point_labels' with random values [Step 10/11] Measuring performance (Start inference asynchronously, 6 inference requests, limits: 15000 ms duration) [ INFO ] Benchmarking in full mode (inputs filling are included in measurement loop). 
- [ INFO ] First inference took 583.55 ms + [ INFO ] First inference took 586.73 ms [Step 11/11] Dumping statistics report [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 56 iterations - [ INFO ] Duration: 16266.69 ms + [ INFO ] Count: 55 iterations + [ INFO ] Duration: 15880.08 ms [ INFO ] Latency: - [ INFO ] Median: 1710.59 ms - [ INFO ] Average: 1692.97 ms - [ INFO ] Min: 713.08 ms - [ INFO ] Max: 1952.47 ms - [ INFO ] Throughput: 3.44 FPS + [ INFO ] Median: 1710.19 ms + [ INFO ] Average: 1694.56 ms + [ INFO ] Min: 569.82 ms + [ INFO ] Max: 1827.81 ms + [ INFO ] Throughput: 3.46 FPS Interactive segmentation demo @@ -1316,7 +1334,7 @@ Interactive segmentation demo .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/efficient-sam Running on local URL: http://127.0.0.1:7860 To create a public link, set `share=True` in `launch()`. 
diff --git a/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_17_1.png b/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_17_1.png index f9dfb53e3b8796..ee488196e09a35 100644 --- a/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_17_1.png +++ b/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_17_1.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cffb9233e156bb558299a8c9bd3931dad6999f9bf7f358b208549949411460d1 -size 1259114 +oid sha256:c724c8a2e1ea229d28fc4828d1e0f8e3709b56e66b4568cd5c300123a6b6990b +size 1259642 diff --git a/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_25_1.png b/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_25_1.png index 108e6e0e4564e0..25a70458403cd0 100644 --- a/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_25_1.png +++ b/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_25_1.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5760726cd720e435c5d3a85315e772a741d583553996d8cfe7833f5d941e79f3 -size 1260778 +oid sha256:8086bb37d6a8400d681ce701a0ccd8aca10ef94cbb1d2fd387ae08f06e26342a +size 1262788 diff --git a/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_36_1.png b/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_36_1.png index c767ab3d6193bd..cb5a9e6e89c825 100644 --- a/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_36_1.png +++ b/docs/notebooks/efficient-sam-with-output_files/efficient-sam-with-output_36_1.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3909739937c5c50e2b26b3cba0b8b30e98e13fee3eab6c4f382735ec82ae9250 -size 1261525 +oid sha256:a18bb4842ab402d752631d693ed64876b58b8cd3cff35bbb3342ba67b35f2c30 +size 1260902 diff --git a/docs/notebooks/encodec-audio-compression-with-output.rst 
b/docs/notebooks/encodec-audio-compression-with-output.rst index 4cf2479f638656..4d10def61a4a57 100644 --- a/docs/notebooks/encodec-audio-compression-with-output.rst +++ b/docs/notebooks/encodec-audio-compression-with-output.rst @@ -72,8 +72,6 @@ Install required dependencies: .. parsed-literal:: - ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts. - torchvision 0.17.2+cpu requires torch==2.2.2, but you have torch 2.4.1+cpu which is incompatible. Note: you may need to restart the kernel to use updated packages. @@ -142,7 +140,7 @@ bandwidth. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/nn/utils/weight_norm.py:134: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/nn/utils/weight_norm.py:134: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`. WeightNorm.apply(module, name, dim) @@ -302,7 +300,7 @@ similar as possible to the original. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/nn/utils/weight_norm.py:134: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`. 
+ /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/nn/utils/weight_norm.py:134: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`. WeightNorm.apply(module, name, dim) @@ -402,13 +400,13 @@ with ``ov.save_model``. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/encodec/modules/conv.py:60: TracerWarning: Converting a tensor to a Python float might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/encodec/modules/conv.py:60: TracerWarning: Converting a tensor to a Python float might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total) - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/encodec/modules/conv.py:85: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
+ /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/encodec/modules/conv.py:85: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right) - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/encodec/modules/conv.py:87: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/encodec/modules/conv.py:87: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! max_pad = max(padding_left, padding_right) - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/encodec/modules/conv.py:89: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
+ /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/encodec/modules/conv.py:89: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if length <= max_pad: @@ -428,11 +426,11 @@ with ``ov.save_model``. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/encodec/quantization/core_vq.py:358: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/encodec/quantization/core_vq.py:358: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect. quantized_out = torch.tensor(0.0, device=q_indices.device) - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/encodec/quantization/core_vq.py:359: TracerWarning: Iterating over a tensor might cause the trace to be incorrect. Passing a tensor of different shape won't change the number of iterations executed (and might lead to errors or silently give incorrect results). 
+ /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/encodec/quantization/core_vq.py:359: TracerWarning: Iterating over a tensor might cause the trace to be incorrect. Passing a tensor of different shape won't change the number of iterations executed (and might lead to errors or silently give incorrect results). for i, indices in enumerate(q_indices): - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/encodec/modules/conv.py:103: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/encodec/modules/conv.py:103: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! assert (padding_left + padding_right) <= x.shape[-1] diff --git a/docs/notebooks/fast-segment-anything-with-output.rst b/docs/notebooks/fast-segment-anything-with-output.rst index 9becf2719559bc..0071e2dca60e74 100644 --- a/docs/notebooks/fast-segment-anything-with-output.rst +++ b/docs/notebooks/fast-segment-anything-with-output.rst @@ -85,8 +85,6 @@ Install requirements .. parsed-literal:: - ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts. 
- torchaudio 2.4.1+cpu requires torch==2.4.1, but you have torch 2.2.2+cpu which is incompatible. Note: you may need to restart the kernel to use updated packages. Note: you may need to restart the kernel to use updated packages. Note: you may need to restart the kernel to use updated packages. @@ -158,7 +156,9 @@ model and generate a segmentation map. .. parsed-literal:: - 100%|██████████| 138M/138M [00:02<00:00, 48.9MB/s] + 100%|██████████| 138M/138M [00:03<00:00, 46.3MB/s] + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/ultralytics/nn/tasks.py:732: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + ckpt = torch.load(file, map_location="cpu") @@ -170,8 +170,8 @@ model and generate a segmentation map. .. 
parsed-literal:: - image 1/1 /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/fast-segment-anything/coco_bike.jpg: 768x1024 37 objects, 642.9ms - Speed: 3.9ms preprocess, 642.9ms inference, 771.9ms postprocess per image at shape (1, 3, 768, 1024) + image 1/1 /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/fast-segment-anything/coco_bike.jpg: 768x1024 37 objects, 638.3ms + Speed: 3.4ms preprocess, 638.3ms inference, 500.4ms postprocess per image at shape (1, 3, 768, 1024) The model returns segmentation maps for all the objects on the image. @@ -209,15 +209,15 @@ tracing. The FastSAM model itself is based on YOLOv8 model. .. parsed-literal:: - Ultralytics YOLOv8.2.24 🚀 Python-3.8.10 torch-2.2.2+cpu CPU (Intel Core(TM) i9-10920X 3.50GHz) + Ultralytics YOLOv8.2.24 🚀 Python-3.8.10 torch-2.4.1+cpu CPU (Intel Core(TM) i9-10920X 3.50GHz) PyTorch: starting from 'FastSAM-x.pt' with input shape (1, 3, 1024, 1024) BCHW and output shape(s) ((1, 37, 21504), (1, 32, 256, 256)) (138.3 MB) OpenVINO: starting export with openvino 2024.4.0-16579-c3152d32c9c-releases/2024/4... 
- OpenVINO: export success ✅ 6.1s, saved as 'FastSAM-x_openvino_model/' (276.1 MB) + OpenVINO: export success ✅ 6.2s, saved as 'FastSAM-x_openvino_model/' (276.1 MB) - Export complete (9.1s) - Results saved to /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/fast-segment-anything + Export complete (9.2s) + Results saved to /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/fast-segment-anything Predict: yolo predict task=segment model=FastSAM-x_openvino_model imgsz=1024 Validate: yolo val task=segment model=FastSAM-x_openvino_model imgsz=1024 data=ultralytics/datasets/sa.yaml Visualize: https://netron.app @@ -321,8 +321,8 @@ pipeline. .. parsed-literal:: - image 1/1 /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/fast-segment-anything/coco_bike.jpg: 1024x1024 42 objects, 494.2ms - Speed: 6.6ms preprocess, 494.2ms inference, 30.3ms postprocess per image at shape (1, 3, 1024, 1024) + image 1/1 /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/fast-segment-anything/coco_bike.jpg: 1024x1024 42 objects, 498.5ms + Speed: 6.1ms preprocess, 498.5ms inference, 31.6ms postprocess per image at shape (1, 3, 1024, 1024) One can observe the converted model outputs in the next cell, they is @@ -521,6 +521,11 @@ repo <-with-output.html>`__. preset=nncf.QuantizationPreset.MIXED) +.. parsed-literal:: + + :7: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console) + + .. parsed-literal:: INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino @@ -615,8 +620,8 @@ calibration dataset to measure the performance. .. 
parsed-literal:: - Segmented in 72 seconds. - Resulting in 1.78 fps + Segmented in 68 seconds. + Resulting in 1.88 fps .. code:: ipython3 @@ -643,9 +648,9 @@ calibration dataset to measure the performance. .. parsed-literal:: - Segmented in 23 seconds - Resulting in 5.57 fps - That is 3.13 times faster! + Segmented in 21 seconds + Resulting in 6.1 fps + That is 3.24 times faster! Try out the converted pipeline diff --git a/docs/notebooks/florence2-with-output.rst b/docs/notebooks/florence2-with-output.rst index e4ab6fbcbd3a3b..7ec9ce6e6557ca 100644 --- a/docs/notebooks/florence2-with-output.rst +++ b/docs/notebooks/florence2-with-output.rst @@ -100,10 +100,10 @@ available model. By default, we will use .. parsed-literal:: - 2024-11-22 01:05:34.426758: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 01:05:34.462006: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 01:48:13.363088: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 01:48:13.396921: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. 
- 2024-11-22 01:05:35.115966: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT + 2024-12-10 01:48:14.055295: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT @@ -193,31 +193,31 @@ pipeline. .. parsed-literal:: - config.json: 0%| | 0.00/2.43k [00:00 1 or self.sliding_window is not None: /opt/home/k8sworker/.cache/huggingface/modules/transformers_modules/chkpt/modeling_florence2.py:1205: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! is_causal = True if self.is_causal and attention_mask is None and tgt_len > 1 else False diff --git a/docs/notebooks/florence2-with-output_files/florence2-with-output_18_0.png b/docs/notebooks/florence2-with-output_files/florence2-with-output_18_0.png index c233468fe95f4e..0ffc56ebd94d65 100644 --- a/docs/notebooks/florence2-with-output_files/florence2-with-output_18_0.png +++ b/docs/notebooks/florence2-with-output_files/florence2-with-output_18_0.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d15ed97d6e50919caff2aee785bc4c90f91dcfcc9bb248f70e9d79bb203be64f -size 259663 +oid sha256:552934f1e05cf6d598ce249bb662530388c1f3335dc2a6af6c304825c8aa023a +size 259656 diff --git a/docs/notebooks/freevc-voice-conversion-with-output.rst b/docs/notebooks/freevc-voice-conversion-with-output.rst index eb1dffbcf5da08..69a935f4c4f78d 100644 --- a/docs/notebooks/freevc-voice-conversion-with-output.rst +++ b/docs/notebooks/freevc-voice-conversion-with-output.rst @@ -133,8 +133,8 @@ Install extra requirements Downloading... 
From: https://drive.google.com/uc?id=12-cB34qCTvByWT-QtOcZaqwwO21FLSqU&confirm=t&uuid=a703c43c-ccce-436c-8799-c11b88e9e7e4 - To: /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/freevc-voice-conversion/FreeVC/wavlm/WavLM-Large.pt - 100%|██████████| 1.26G/1.26G [00:26<00:00, 47.5MB/s] + To: /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/freevc-voice-conversion/FreeVC/wavlm/WavLM-Large.pt + 100%|██████████| 1.26G/1.26G [01:03<00:00, 19.9MB/s] .. code:: ipython3 @@ -153,7 +153,7 @@ Install extra requirements .. parsed-literal:: - checkpoints/freevc.pth: 0%| | 0.00/451M [00:00 - + Your browser does not support the audio element. diff --git a/docs/notebooks/glm-edge-v-with-output.rst b/docs/notebooks/glm-edge-v-with-output.rst new file mode 100644 index 00000000000000..2449d414d82594 --- /dev/null +++ b/docs/notebooks/glm-edge-v-with-output.rst @@ -0,0 +1,516 @@ +Visual-language assistant with GLM-Edge-V and OpenVINO +------------------------------------------------------ + +The +`GLM-Edge `__ +series is `Zhipu `__\ ’s attempt to meet +real-world deployment scenarios for edge devices. It consists of two +sizes of large language dialogue models and multimodal understanding +models (GLM-Edge-1.5B-Chat, GLM-Edge-4B-Chat, GLM-Edge-V-2B, +GLM-Edge-V-5B). Among them, the 1.5B / 2B models are mainly targeted at +platforms like mobile phones and car machines, while the 4B / 5B models +are aimed at platforms like PCs. Based on the technological advancements +of the GLM-4 series, some targeted adjustments have been made to the +model structure and size, balancing model performance, real-world +inference efficiency, and deployment convenience. 
Through deep +collaboration with partner enterprises and relentless efforts in +inference optimization, the GLM-Edge series models can run at extremely +high speeds on some edge platforms. + +In this tutorial we consider how to launch multimodal model GLM-Edge-V +using OpenVINO for creation multimodal chatbot. Additionally, we +optimize model to low precision using +`NNCF `__ + +**Table of contents:** + +- `Prerequisites <#prerequisites>`__ +- `Select Model <#select-model>`__ +- `Convert and Optimize model <#convert-and-optimize-model>`__ + + - `Compress model weights to + 4-bit <#compress-model-weights-to-4-bit>`__ + +- `Select inference device <#select-inference-device>`__ +- `Run OpenVINO model <#run-openvino-model>`__ +- `Interactive demo <#interactive-demo>`__ + +Installation Instructions +~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is a self-contained example that relies solely on its own code. + +We recommend running the notebook in a virtual environment. You only +need a Jupyter server to start. For details, please refer to +`Installation +Guide `__. + +Prerequisites +------------- + + + +install required packages and setup helper functions. + +.. code:: ipython3 + + %pip install -q "torch>=2.1" "torchvision" "protobuf>=3.20" "gradio>=4.26" "Pillow" "accelerate" "tqdm" --extra-index-url https://download.pytorch.org/whl/cpu + %pip install -q "openvino>=2024.5.0" "nncf>=2.14.0" + + +.. parsed-literal:: + + Note: you may need to restart the kernel to use updated packages. 
+ ERROR: Could not find a version that satisfies the requirement openvino>=2024.5.0 (from versions: 2021.3.0, 2021.4.0, 2021.4.1, 2021.4.2, 2022.1.0, 2022.2.0, 2022.3.0, 2022.3.1, 2022.3.2, 2023.0.0.dev20230119, 2023.0.0.dev20230217, 2023.0.0.dev20230407, 2023.0.0.dev20230427, 2023.0.0, 2023.0.1, 2023.0.2, 2023.1.0.dev20230623, 2023.1.0.dev20230728, 2023.1.0.dev20230811, 2023.1.0, 2023.2.0.dev20230922, 2023.2.0, 2023.3.0, 2024.0.0, 2024.1.0, 2024.2.0, 2024.3.0, 2024.4.0, 2024.4.1.dev20240926) + ERROR: No matching distribution found for openvino>=2024.5.0 + Note: you may need to restart the kernel to use updated packages. + + +.. code:: ipython3 + + %pip install -q "git+https://github.com/huggingface/transformers" + + +.. parsed-literal:: + + error: subprocess-exited-with-error + + × Preparing metadata (pyproject.toml) did not run successfully. + │ exit code: 1 + ╰─> [6 lines of output] + + Cargo, the Rust package manager, is not installed or is not on PATH. + This package requires Rust and Cargo to compile extensions. Install it through + the system's package manager or via https://rustup.rs/ + + Checking for Rust toolchain.... + [end of output] + + note: This error originates from a subprocess, and is likely not a problem with pip. + error: metadata-generation-failed + + × Encountered error while generating package metadata. + ╰─> See above for output. + + note: This is an issue with the package mentioned above, not pip. + hint: See above for details. + Note: you may need to restart the kernel to use updated packages. + + +.. 
code:: ipython3 + + import requests + from pathlib import Path + + if not Path("glmv_helper.py").exists(): + r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/glm-edge-v/glmv_helper.py") + open("glmv_helper.py", "w").write(r.text) + + + if not Path("gradio_helper.py").exists(): + r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/glm-edge-v/gradio_helper.py") + open("gradio_helper.py", "w").write(r.text) + + if not Path("notebook_utils.py").exists(): + r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py") + open("notebook_utils.py", "w").write(r.text) + +Select Model +------------ + + + +The tutorial supports the following models from GLM-Edge-V model family: + +- `glm-edge-v-2b `__ +- `glm-edge-v-5b `__ + +You can select one from the provided options below. + +.. code:: ipython3 + + import ipywidgets as widgets + + # Select model + model_ids = [ + "THUDM/glm-edge-v-2b", + "THUDM/glm-edge-v-5b", + ] + + model_dropdown = widgets.Dropdown( + options=model_ids, + value=model_ids[0], + description="Model:", + disabled=False, + ) + + model_dropdown + + + + +.. parsed-literal:: + + Dropdown(description='Model:', options=('THUDM/glm-edge-v-2b', 'THUDM/glm-edge-v-5b'), value='THUDM/glm-edge-v… + + + +Convert and Optimize model +-------------------------- + + + +GLM-Edge-V is PyTorch model. OpenVINO supports PyTorch models via +conversion to OpenVINO Intermediate Representation (IR). `OpenVINO model +conversion +API `__ +should be used for these purposes. ``ov.convert_model`` function accepts +original PyTorch model instance and example input for tracing and +returns ``ov.Model`` representing this model in OpenVINO framework. +Converted model can be used for saving on disk using ``ov.save_model`` +function or directly loading on device using ``core.complie_model``. 
+ +The script ``glmv_helper.py`` contains helper function for model +conversion, please check its content if you interested in conversion +details. + +.. raw:: html + +
+ +Click here for more detailed explanation of conversion steps GLM-Edge-V +is autoregressive transformer generative model, it means that each next +model step depends from model output from previous step. The generation +approach is based on the assumption that the probability distribution of +a word sequence can be decomposed into the product of conditional next +word distributions. In other words, model predicts the next token in the +loop guided by previously generated tokens until the stop-condition will +be not reached (generated sequence of maximum length or end of string +token obtained). The way the next token will be selected over predicted +probabilities is driven by the selected decoding methodology. You can +find more information about the most popular decoding methods in this +blog. The entry point for the generation process for models from the +Hugging Face Transformers library is the ``generate`` method. You can +find more information about its parameters and configuration in the +documentation. To preserve flexibility in the selection decoding +methodology, we will convert only model inference for one step. + +GLM-Edge-V model consists of 3 parts: + +- **Vision Model** for encoding input images into embedding space. +- **Embedding Model** for conversion input text tokens into embedding + space +- **Language Model** for generation answer based on input embeddings + provided by Image Encoder and Input Embedding models. + +.. raw:: html + +
+ +Compress model weights to 4-bit +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For reducing memory +consumption, weights compression optimization can be applied using +`NNCF `__. + +.. raw:: html + +
+ +Click here for more details about weight compression Weight compression +aims to reduce the memory footprint of a model. It can also lead to +significant performance improvement for large memory-bound models, such +as Large Language Models (LLMs). LLMs and other models, which require +extensive memory to store the weights during inference, can benefit from +weight compression in the following ways: + +- enabling the inference of exceptionally large models that cannot be + accommodated in the memory of the device; + +- improving the inference performance of the models by reducing the + latency of the memory access when computing the operations with + weights, for example, Linear layers. + +`Neural Network Compression Framework +(NNCF) `__ provides 4-bit / +8-bit mixed weight quantization as a compression method primarily +designed to optimize LLMs. The main difference between weights +compression and full model quantization (post-training quantization) is +that activations remain floating-point in the case of weights +compression which leads to a better accuracy. Weight compression for +LLMs provides a solid inference performance improvement which is on par +with the performance of the full model quantization. In addition, weight +compression is data-free and does not require a calibration dataset, +making it easy to use. + +``nncf.compress_weights`` function can be used for performing weights +compression. The function accepts an OpenVINO model and other +compression parameters. Compared to INT8 compression, INT4 compression +improves performance even more, but introduces a minor drop in +prediction quality. + +More details about weights compression, can be found in `OpenVINO +documentation `__. + +.. raw:: html + +
+ +.. code:: ipython3 + + from pathlib import Path + import nncf + from glmv_helper import convert_glmv_model + + + model_id = model_dropdown.value + out_dir = Path("model") / Path(model_id).name / "INT4" + compression_configuration = { + "mode": nncf.CompressWeightsMode.INT4_SYM, + "group_size": 64, + "ratio": 0.6, + } + convert_glmv_model(model_id, out_dir, compression_configuration) + + +.. parsed-literal:: + + INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino + + +.. parsed-literal:: + + 2024-12-10 01:51:54.756921: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 01:51:54.790860: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. + 2024-12-10 01:51:55.339388: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT + + +.. parsed-literal:: + + ⌛ glm-edge-v-2b conversion started. Be patient, it may takes some time. + ⌛ Load Original model + ✅ Original model successfully loaded + ⌛ Convert Input embedding model + WARNING:tensorflow:Please fix your imports. Module tensorflow.python.training.tracking.base has been moved to tensorflow.python.trackable.base. The old module will be deleted in version 2.11. + + +.. parsed-literal:: + + [ WARNING ] Please fix your imports. Module %s has been moved to %s. The old module will be deleted in version %s. 
+ /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead + warnings.warn( + `loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`. + + +.. parsed-literal:: + + ✅ Input embedding model successfully converted + ⌛ Convert Image embedding model + + +.. parsed-literal:: + + /opt/home/k8sworker/.cache/huggingface/modules/transformers_modules/THUDM/glm-edge-v-2b/30c2bc691c9d46433abfd450e04441458d503f34/siglip.py:48: TracerWarning: Converting a tensor to a Python integer might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + grid_size = int(s**0.5) + /opt/home/k8sworker/.cache/huggingface/modules/transformers_modules/THUDM/glm-edge-v-2b/30c2bc691c9d46433abfd450e04441458d503f34/siglip.py:53: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results. + image_emb = torch.cat([self.boi.repeat(len(image_emb), 1, 1), image_emb, self.eoi.repeat(len(image_emb), 1, 1)], dim=1) + + +.. parsed-literal:: + + ✅ Image embedding model successfully converted + ⌛ Convert Language model + + +.. parsed-literal:: + + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/cache_utils.py:458: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. 
Passing a tensor of different shape might lead to errors or silently give incorrect results. + or len(self.key_cache[layer_idx]) == 0 # the layer has no cache + /opt/home/k8sworker/.cache/huggingface/modules/transformers_modules/THUDM/glm-edge-v-2b/30c2bc691c9d46433abfd450e04441458d503f34/modeling_glm.py:995: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + if sequence_length != 1: + /opt/home/k8sworker/.cache/huggingface/modules/transformers_modules/THUDM/glm-edge-v-2b/30c2bc691c9d46433abfd450e04441458d503f34/modeling_glm.py:153: TracerWarning: Converting a tensor to a Python integer might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + rotary_dim = int(q.shape[-1] * partial_rotary_factor) + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/cache_utils.py:443: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results. + elif len(self.key_cache[layer_idx]) == 0: # fills previously skipped layers; checking for tensor causes errors + /opt/home/k8sworker/.cache/huggingface/modules/transformers_modules/THUDM/glm-edge-v-2b/30c2bc691c9d46433abfd450e04441458d503f34/modeling_glm.py:249: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. 
This means that the trace might not generalize to other inputs! + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/jit/_trace.py:168: UserWarning: The .grad attribute of a Tensor that is not a leaf Tensor is being accessed. Its .grad attribute won't be populated during autograd.backward(). If you indeed want the .grad field to be populated for a non-leaf Tensor, use .retain_grad() on the non-leaf Tensor. If you access the non-leaf Tensor by mistake, make sure you access the leaf Tensor instead. See github.com/pytorch/pytorch/pull/30531 for more informations. (Triggered internally at aten/src/ATen/core/TensorBody.h:489.) + if a.grad is not None: + + +.. parsed-literal:: + + ✅ Language model successfully converted + ⌛ Weights compression with int4_sym mode started + + + +.. parsed-literal:: + + Output() + + + + + + + + + +.. parsed-literal:: + + INFO:nncf:Statistics of the bitwidth distribution: + ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ + │ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │ + ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ + │ 8 │ 45% (115 / 169) │ 40% (114 / 168) │ + ├────────────────┼─────────────────────────────┼────────────────────────────────────────┤ + │ 4 │ 55% (54 / 169) │ 60% (54 / 168) │ + ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ + + + +.. parsed-literal:: + + Output() + + + + + + + + + +.. parsed-literal:: + + ✅ Weights compression finished + ✅ glm-edge-v-2b model conversion finished. You can find results in model/glm-edge-v-2b/INT4 + + +Select inference device +----------------------- + + + +.. 
code:: ipython3 + + from notebook_utils import device_widget + + device = device_widget(default="AUTO", exclude=["NPU"]) + + device + + + + +.. parsed-literal:: + + Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') + + + +Run OpenVINO model +------------------ + + + +The ``OvGLMv`` class provides a convenient way of running the model. It accepts a +directory with the converted model and an inference device as arguments. For +running the model we will use the ``generate`` method. + +.. code:: ipython3 + + from glmv_helper import OvGLMv + + model = OvGLMv(out_dir, device.value) + +.. code:: ipython3 + + import requests + from PIL import Image + + url = "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11" + image = Image.open(requests.get(url, stream=True).raw) + + query = "Please describe this picture" + + print(f"Question:\n {query}") + image + + +.. parsed-literal:: + + Question: + Please describe this picture + + + + +.. image:: glm-edge-v-with-output_files/glm-edge-v-with-output_13_1.png + + + +.. code:: ipython3 + + from transformers import TextStreamer, AutoImageProcessor, AutoTokenizer + import torch + + messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": query}]}] + + processor = AutoImageProcessor.from_pretrained(out_dir, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(out_dir, trust_remote_code=True) + inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_dict=True, tokenize=True, return_tensors="pt").to("cpu") + generate_kwargs = { + **inputs, + "pixel_values": torch.tensor(processor(image).pixel_values).to("cpu"), + "max_new_tokens": 100, + "do_sample": True, + "top_k": 20, + "streamer": TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True), + } + + print("Answer:") + output = model.generate(**generate_kwargs) + + +.. 
parsed-literal:: + + Answer: + The image depicts a cat resting inside a cardboard box placed on a soft carpeted floor. The cat is lying with its head towards the bottom of the box, and its front paws are stretched out with the right one slightly forward, while its back and hind legs are positioned in the box. The box appears to be in partial disassembly, with the flaps folded down and one side raised slightly off the ground. The cat's fur is well-groomed and + + +Interactive demo +---------------- + + + +.. code:: ipython3 + + from gradio_helper import make_demo + + demo = make_demo(model, processor, tokenizer) + + try: + demo.launch(debug=False, height=600) + except Exception: + demo.launch(debug=False, share=True, height=600) + # if you are launching remotely, specify server_name and server_port + # demo.launch(server_name='your server name', server_port='server port in int') + # Read more in the docs: https://gradio.app/docs/ + + +.. parsed-literal:: + + Running on local URL: http://127.0.0.1:7860 + + To create a public link, set `share=True` in `launch()`. 
+ + + + + + + diff --git a/docs/notebooks/glm-edge-v-with-output_files/glm-edge-v-with-output_13_1.jpg b/docs/notebooks/glm-edge-v-with-output_files/glm-edge-v-with-output_13_1.jpg new file mode 100644 index 00000000000000..c6aeec77cd3cb2 --- /dev/null +++ b/docs/notebooks/glm-edge-v-with-output_files/glm-edge-v-with-output_13_1.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fc0d22d75f23474fb4f8aec8c0bf0fdf5d9377f3379e82a3887003e6da47e7e +size 60425 diff --git a/docs/notebooks/glm-edge-v-with-output_files/glm-edge-v-with-output_13_1.png b/docs/notebooks/glm-edge-v-with-output_files/glm-edge-v-with-output_13_1.png new file mode 100644 index 00000000000000..c6673a757ab5dc --- /dev/null +++ b/docs/notebooks/glm-edge-v-with-output_files/glm-edge-v-with-output_13_1.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c715d8adee4bf7519690de20b57ef2edaa2f914c86a64d107f99a919dcdad218 +size 854224 diff --git a/docs/notebooks/grounded-segment-anything-with-output.rst b/docs/notebooks/grounded-segment-anything-with-output.rst index a51ce8249239f9..6449fb1a6a9507 100644 --- a/docs/notebooks/grounded-segment-anything-with-output.rst +++ b/docs/notebooks/grounded-segment-anything-with-output.rst @@ -201,7 +201,7 @@ Download checkpoints and load PyTorch models .. parsed-literal:: - checkpoints/groundingdino_swint_ogc.pth: 0%| | 0.00/662M [00:00 + @@ -215,7 +215,7 @@ Do Inference .. 
parsed-literal:: - + diff --git a/docs/notebooks/hello-segmentation-with-output_files/hello-segmentation-with-output_11_1.png b/docs/notebooks/hello-segmentation-with-output_files/hello-segmentation-with-output_11_1.png new file mode 100644 index 00000000000000..3677caabff4380 --- /dev/null +++ b/docs/notebooks/hello-segmentation-with-output_files/hello-segmentation-with-output_11_1.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76113c575caa9c8a8aca45d3ec6ebd7a4b513dadffd8e9e63861a7a041d7e5de +size 249032 diff --git a/docs/notebooks/hello-world-with-output.rst b/docs/notebooks/hello-world-with-output.rst index 5bd1216db29701..94d6dca5798876 100644 --- a/docs/notebooks/hello-world-with-output.rst +++ b/docs/notebooks/hello-world-with-output.rst @@ -98,13 +98,13 @@ Download the Model and data samples .. parsed-literal:: - artifacts/v3-small_224_1.0_float.xml: 0%| | 0.00/294k [00:00=4.33.0" "torch>=2.1.0" %pip install -q ipywidgets - %pip install -q "openvino>=2023.1.0" + %pip install -q "openvino>=2023.1.0" "Pillow" .. parsed-literal:: @@ -132,10 +132,10 @@ tutorials `__. from optimum.intel.openvino import OVModelForSequenceClassification - -.. parsed-literal:: - - huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... - To disable this warning, you can either: - - Avoid using `tokenizers` before the fork if possible - - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) - - Initialize and Convert the Model Automatically using OVModel class ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -436,7 +427,7 @@ Full list of supported arguments available via ``--help`` .. 
parsed-literal:: - 2024-11-22 01:15:03.858078: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT + 2024-12-10 01:57:20.152345: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT usage: optimum-cli export openvino [-h] -m MODEL [--task TASK] [--framework {pt,tf}] [--trust-remote-code] [--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}] @@ -445,8 +436,10 @@ Full list of supported arguments available via ``--help`` [--pad-token-id PAD_TOKEN_ID] [--ratio RATIO] [--sym] [--group-size GROUP_SIZE] + [--backup-precision {none,int8_sym,int8_asym}] [--dataset DATASET] [--all-layers] [--awq] [--scale-estimation] [--gptq] + [--lora-correction] [--sensitivity-metric SENSITIVITY_METRIC] [--num-samples NUM_SAMPLES] [--disable-stateful] @@ -467,20 +460,20 @@ Full list of supported arguments available via ``--help`` --task TASK The task to export the model for. If not specified, the task will be auto-inferred based on the model. Available tasks depend on the model, but are among: - ['audio-xvector', 'image-text-to-text', 'mask- - generation', 'text-generation', 'masked-im', 'image- - classification', 'token-classification', 'question- - answering', 'automatic-speech-recognition', 'multiple- - choice', 'image-segmentation', 'semantic- - segmentation', 'text2text-generation', 'feature- - extraction', 'image-to-text', 'text-to-audio', 'text- - to-image', 'zero-shot-object-detection', 'inpainting', - 'zero-shot-image-classification', 'object-detection', - 'text-classification', 'image-to-image', 'sentence- - similarity', 'audio-frame-classification', 'depth- - estimation', 'audio-classification', 'fill-mask']. For - decoder models, use `xxx-with-past` to export the - model using past key values in the decoder. 
+ ['zero-shot-object-detection', 'multiple-choice', + 'audio-xvector', 'masked-im', 'text2text-generation', + 'inpainting', 'image-segmentation', 'semantic- + segmentation', 'question-answering', 'token- + classification', 'audio-frame-classification', + 'feature-extraction', 'text-to-audio', 'image-to- + image', 'fill-mask', 'automatic-speech-recognition', + 'image-classification', 'text-classification', 'zero- + shot-image-classification', 'object-detection', + 'image-to-text', 'audio-classification', 'sentence- + similarity', 'depth-estimation', 'text-to-image', + 'mask-generation', 'text-generation']. For decoder + models, use `xxx-with-past` to export the model using + past key values in the decoder. --framework {pt,tf} The framework to use for the export. If not provided, will attempt to use the local checkpoint's original framework or what is available in the environment. @@ -514,12 +507,27 @@ Full list of supported arguments available via ``--help`` --group-size GROUP_SIZE The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization. + --backup-precision {none,int8_sym,int8_asym} + Defines a backup precision for mixed-precision weight + compression. Only valid for int4 weight format. If not + provided, backup precision is int8_asym. 'none' stands + for original floating-point precision of the model + weights, in this case weights are retained in their + original precision without any quantization. + 'int8_sym' stands for 8-bit integer symmetric + quantization without zero point. 'int8_asym' stands + for 8-bit integer asymmetric quantization with zero + points per each quantization group. --dataset DATASET The dataset used for data-aware compression or - quantization with NNCF. You can use the one from the - list ['wikitext2','c4','c4-new'] for language models - or ['conceptual_captions','laion/220k-GPT4Vision- - captions-from-LIVIS','laion/filtered-wit'] for - diffusion models. + quantization with NNCF. 
For language models you can + use the one from the list + ['auto','wikitext2','c4','c4-new']. With 'auto' the + dataset will be collected from model's generations. + For diffusion models it should be on of + ['conceptual_captions','laion/220k-GPT4Vision- + captions-from-LIVIS','laion/filtered-wit']. For visual + language models the dataset must be set to + 'contextual'. --all-layers Whether embeddings and last MatMul layers should be compressed to INT4. If not provided an weight compression is applied, they are compressed to INT8. @@ -527,7 +535,7 @@ Full list of supported arguments available via ``--help`` generation quality of INT4-compressed LLMs, but requires additional time for tuning weights on a calibration dataset. To run AWQ, please also provide a - dataset argument. Note: it's possible that there will + dataset argument. Note: it is possible that there will be no matching patterns in the model to apply AWQ, in such case it will be skipped. --scale-estimation Indicates whether to apply a scale estimation @@ -541,9 +549,15 @@ Full list of supported arguments available via ``--help`` to minimize the difference between activations of a compressed and original layer. Please note, that applying GPTQ takes additional memory and time. + --lora-correction Indicates whether to apply LoRA Correction algorithm. + When enabled, this algorithm introduces low-rank + adaptation layers in the model that can recover + accuracy after weight compression at some cost of + inference latency. Please note, that applying LoRA + Correction algorithm takes additional memory and time. --sensitivity-metric SENSITIVITY_METRIC The sensitivity metric for assigning quantization - precision to layers. Can be one of the following: + precision to layers. It can be one of the following: ['weight_quantization_error', 'hessian_input_activation', 'mean_activation_variance', 'max_activation_variance', @@ -561,7 +575,7 @@ Full list of supported arguments available via ``--help`` performance. 
Use it when you intentionally want to use a stateless model, for example, to be compatible with existing OpenVINO native inference code that expects - kv-cache inputs and outputs in the model. + KV-cache inputs and outputs in the model. --disable-convert-tokenizer Do not add converted tokenizer and detokenizer OpenVINO models. @@ -585,7 +599,7 @@ compression: .. parsed-literal:: - 2024-11-22 01:15:09.417610: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT + 2024-12-10 01:57:25.755800: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight'] - This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). - This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). @@ -636,9 +650,8 @@ OpenVINO `__ 3. `Stable Diffusion v2.1 using Optimum-Intel OpenVINO `__ 4. `Image generation with Stable Diffusion -XL `__ 5. `Instruction following using -Databricks Dolly 2.0 `__ 6. `Create -LLM-powered Chatbot using OpenVINO `__ 7. `Document -Visual Question Answering Using Pix2Struct and -OpenVINO `__ 8. `Automatic speech recognition -using Distil-Whisper and OpenVINO `__ +XL `__ 5. `Create LLM-powered Chatbot using +OpenVINO `__ 6. `Document Visual Question Answering +Using Pix2Struct and OpenVINO `__ 7. 
`Automatic +speech recognition using Distil-Whisper and +OpenVINO `__ diff --git a/docs/notebooks/hunyuan-dit-image-generation-with-output.rst b/docs/notebooks/hunyuan-dit-image-generation-with-output.rst index 01b20ab650824e..61c412fe6f5e62 100644 --- a/docs/notebooks/hunyuan-dit-image-generation-with-output.rst +++ b/docs/notebooks/hunyuan-dit-image-generation-with-output.rst @@ -36,6 +36,7 @@ using OpenVINO. Additionally, we will use `NNCF `__ for optimizing model in low precision. + **Table of contents:** - `Prerequisites <#prerequisites>`__ diff --git a/docs/notebooks/image-classification-quantization-with-output.rst b/docs/notebooks/image-classification-quantization-with-output.rst index 491ca0eed2881a..177ffd97209a57 100644 --- a/docs/notebooks/image-classification-quantization-with-output.rst +++ b/docs/notebooks/image-classification-quantization-with-output.rst @@ -194,7 +194,7 @@ Preprocessing for model obtained from training .. parsed-literal:: - 100%|██████████| 170498071/170498071 [00:07<00:00, 23705445.93it/s] + 100%|██████████| 170498071/170498071 [00:07<00:00, 22538385.96it/s] .. parsed-literal:: @@ -266,10 +266,10 @@ about supported parameters can be found on this .. parsed-literal:: - 2024-11-22 01:15:46.610115: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 01:15:46.641664: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 01:58:02.605724: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. 
To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 01:58:02.638370: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - 2024-11-22 01:15:47.181563: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT + 2024-12-10 01:58:03.190744: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT @@ -431,7 +431,7 @@ Tool `__ to speed up pipeline. + **Table of contents:** - `Prerequisites <#prerequisites>`__ @@ -82,7 +83,6 @@ pipeline. pipelines <#compare-inference-time-of-the-fp16-and-int8-pipelines>`__ - `Interactive demo <#interactive-demo>`__ - Installation Instructions ~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/notebooks/janus-multimodal-generation-with-output.rst b/docs/notebooks/janus-multimodal-generation-with-output.rst new file mode 100644 index 00000000000000..a8a5cc599699c6 --- /dev/null +++ b/docs/notebooks/janus-multimodal-generation-with-output.rst @@ -0,0 +1,472 @@ +Multimodal understanding and generation with Janus and OpenVINO +=============================================================== + +Janus is a novel autoregressive framework that unifies multimodal +understanding and generation. It addresses the limitations of previous +approaches by decoupling visual encoding into separate pathways, while +still utilizing a single, unified transformer architecture for +processing. The decoupling not only alleviates the conflict between the +visual encoder’s roles in understanding and generation, but also +enhances the framework’s flexibility. Janus surpasses previous unified +model and matches or exceeds the performance of task-specific models. 
+The simplicity, high flexibility, and effectiveness of Janus make it a +strong candidate for next-generation unified multimodal models. + +More details can be found in the +`paper `__, original +`repository `__ and `model +card `__ + +In this tutorial we consider how to run and optimize Janus using +OpenVINO. + +**Table of contents:** + +- `Prerequisites <#prerequisites>`__ +- `Convert and Optimize model <#convert-and-optimize-model>`__ + + - `Compress model weights to + 4-bit <#compress-model-weights-to-4-bit>`__ + +- `Create Inference Pipeline <#create-inference-pipeline>`__ + + - `Select Inference Device <#select-inference-device>`__ + - `Run visual language chat <#run-visual-language-chat>`__ + - `Run Image generation <#run-image-generation>`__ + +- `Interactive demo <#interactive-demo>`__ + +Installation Instructions +~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is a self-contained example that relies solely on its own code. + +We recommend running the notebook in a virtual environment. You only +need a Jupyter server to start. For details, please refer to +`Installation +Guide `__. + +Prerequisites +------------- + + + +.. code:: ipython3 + + from pathlib import Path + import requests + + utility_files = ["notebook_utils.py"] + local_helpers = ["ov_janus_helper.py", "gradio_helper.py"] + + base_utils_url = "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/" + base_local_files_url = "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/janus-multimodal-generation/" + + + for util_path in utility_files: + if not Path(util_path).exists(): + r = requests.get(base_utils_url + util_path) + with open(util_path, "w") as f: + f.write(r.text) + + for util_path in local_helpers: + if not Path(util_path).exists(): + r = requests.get(base_local_files_url + util_path) + with open(util_path, "w") as f: + f.write(r.text) + +.. 
code:: ipython3 + + import platform + + %pip install -q "gradio>=4.19" "torch>=2.2" "torchvision" "safetensors" "transformers>=4.38" "nncf>=2.14" --extra-index-url https://download.pytorch.org/whl/cpu + %pip install -q "git+https://github.com/deepseek-ai/Janus" --extra-index-url https://download.pytorch.org/whl/cpu + %pip install -U --pre "openvino>2024.5" --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + + if platform.system() == "Darwin": + %pip install -q "numpy<2.0.0" + +Convert and Optimize model +-------------------------- + + + +Janus is PyTorch model. OpenVINO supports PyTorch models via conversion +to OpenVINO Intermediate Representation (IR). `OpenVINO model conversion +API `__ +should be used for these purposes. ``ov.convert_model`` function accepts +original PyTorch model instance and example input for tracing and +returns ``ov.Model`` representing this model in OpenVINO framework. +Converted model can be used for saving on disk using ``ov.save_model`` +function or directly loading on device using ``core.complie_model``. + +The script ``ov_janus_helper.py`` contains helper function for model +conversion, please check its content if you interested in conversion +details. + +.. raw:: html + +
+ +.. raw:: html + + + +Click here for more detailed explanation of conversion steps + +.. raw:: html + + + +Janus is autoregressive transformer generative model, it means that each +next model step depends from model output from previous step. The +generation approach is based on the assumption that the probability +distribution of a token sequence can be decomposed into the product of +conditional next token distributions. In other words, model predicts the +next token in the loop guided by previously generated tokens until the +stop-condition will be not reached (generated sequence of maximum length +or end of generation token obtained). The way the next token will be +selected over predicted probabilities is driven by the selected decoding +methodology. You can find more information about the most popular +decoding methods in this blog. The entry point for the generation +process for models from the Hugging Face Transformers library is the +``generate`` method. You can find more information about its parameters +and configuration in the documentation. To preserve flexibility in the +selection decoding methodology, we will convert only model inference for +one step. + +For both tasks, image understanding and image generation, Janus utilizes +the same basic transformer architecture in ``language_model`` and change +only components responsible for preparing input embeddings (joined image +embeddings prepared using ``vision_embeddings_model`` and text +embeddings prepared using ``text_embeddings_model`` for image +understanding and ``text_embeddings_model`` on the first step as initial +prompt embeddings and ``gen_embeddings_model`` for the next) and +conversion final hidden state to tokens probabilities (``lm_head`` for +text tokens, ``gen_head`` for image tokens). Additionally, for image +generation model uses ``gen_decoder`` to convert generated image tokens +to images. 
+ +To sum up above, model consists of 7 parts: \* **Image Embeddings** for +encoding input images into embedding space in image understanding task. +\* **Text Embedding** for conversion input text tokens into embedding +space \* **Gen Embeddings** for encoding image generation tokens to +embeddings space in image generation task \* **Language Model** for +generation hidden state guided by input embeddings \* **LM Head** for +conversion Language Model hidden state to text generation token +probabilities \* **Gen Head** for conversion Language Model hidden state +to image generation token probabilities \* **Gen Decoder** for decoding +generated image from latent token space to image tensor space. + +For preserving original model flexibility of switching between tasks, we +also should preserve original model partitioning and convert each model +part separately. + +.. raw:: html + +
+ +Compress model weights to 4-bit +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For reducing memory +consumption, weights compression optimization can be applied using +`NNCF `__. + +.. raw:: html + +
+ +.. raw:: html + + + +Click here for more details about weight compression + +.. raw:: html + + + +Weight compression aims to reduce the memory footprint of a model. It +can also lead to significant performance improvement for large +memory-bound models, such as Large Language Models (LLMs). LLMs and +other models, which require extensive memory to store the weights during +inference, can benefit from weight compression in the following ways: + +- enabling the inference of exceptionally large models that cannot be + accommodated in the memory of the device; + +- improving the inference performance of the models by reducing the + latency of the memory access when computing the operations with + weights, for example, Linear layers. + +`Neural Network Compression Framework +(NNCF) `__ provides 4-bit / +8-bit mixed weight quantization as a compression method primarily +designed to optimize LLMs. The main difference between weights +compression and full model quantization (post-training quantization) is +that activations remain floating-point in the case of weights +compression which leads to a better accuracy. Weight compression for +LLMs provides a solid inference performance improvement which is on par +with the performance of the full model quantization. In addition, weight +compression is data-free and does not require a calibration dataset, +making it easy to use. + +``nncf.compress_weights`` function can be used for performing weights +compression. The function accepts an OpenVINO model and other +compression parameters. Compared to INT8 compression, INT4 compression +improves performance even more, but introduces a minor drop in +prediction quality. + +More details about weights compression, can be found in `OpenVINO +documentation `__. + +.. raw:: html + +
+ +.. code:: ipython3 + + import nncf + from ov_janus_helper import convert_janus_model + + model_id = "deepseek-ai/Janus-1.3B" + model_path = Path(model_id.split("/")[-1] + "-ov") + + compression_configuration = { + "mode": nncf.CompressWeightsMode.INT4_ASYM, + "group_size": 64, + "ratio": 1.0, + } + + # uncomment the line to see model conversion code + # ??convert_janus_model + + +.. parsed-literal:: + + INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino + + +.. parsed-literal:: + + 2024-11-26 20:09:59.629857: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-11-26 20:09:59.643309: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered + WARNING: All log messages before absl::InitializeLog() is called are written to STDERR + E0000 00:00:1732637399.658322 1754417 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered + E0000 00:00:1732637399.662894 1754417 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered + 2024-11-26 20:09:59.679869: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. + + +.. parsed-literal:: + + Python version is above 3.10, patching the collections module. + + +.. 
parsed-literal:: + + /home/ea/work/py311/lib/python3.11/site-packages/transformers/models/auto/image_processing_auto.py:520: FutureWarning: The image_processor_class argument is deprecated and will be removed in v4.42. Please use `slow_image_processor_class`, or `fast_image_processor_class` instead + warnings.warn( + + +.. code:: ipython3 + + convert_janus_model(model_id, model_path, compression_configuration) + + +.. parsed-literal:: + + ✅ Janus-1.3B model already converted. You can find results in Janus-1.3B-ov + + +Create Inference Pipeline +------------------------- + + + +``OVJanusModel`` defined in ``ov_janus_helper.py`` provides unified +interface for running model inference for both text and image +generation. It accepts model directory and target device for inference. + +Select Inference Device +~~~~~~~~~~~~~~~~~~~~~~~ + + + +.. code:: ipython3 + + from notebook_utils import device_widget + + device = device_widget("CPU", ["NPU"]) + + device + + + + +.. parsed-literal:: + + Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU') + + + +.. code:: ipython3 + + from ov_janus_helper import OVJanusModel + from janus.models import VLChatProcessor + + # uncomment the line to see model inference code + + # ??OVJanusModel + +``VLChatPRocessor`` class used for pre- and postprocessing steps in +original Janus model. Our model is also compatible with the same +processor code and we can reuse it. + +.. code:: ipython3 + + ov_model = OVJanusModel(model_path, device.value) + + processor = VLChatProcessor.from_pretrained(model_path) + + +.. parsed-literal:: + + Some kwargs in processor config are unused and will not have any effect: image_end_tag, sft_format, image_tag, num_image_tokens, add_special_token, mask_prompt, ignore_id, image_start_tag. + + +Run visual language chat +~~~~~~~~~~~~~~~~~~~~~~~~ + + + +.. 
code:: ipython3 + + from PIL import Image + from io import BytesIO + from janus.utils.io import load_pil_images + + + input_prompt = "Describe image in details" + image_path = Path("cat_in_box.png") + + if not image_path.exists(): + response = requests.get("https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11") + image = Image.open(BytesIO(response.content)).convert("RGB") + image.save(image_path) + + conversation = [ + { + "role": "User", + "content": f"{input_prompt}\n", + "images": [str(image_path)], + }, + {"role": "Assistant", "content": ""}, + ] + pil_images = load_pil_images(conversation) + +.. code:: ipython3 + + from transformers import TextStreamer + + prepare_inputs = processor(conversations=conversation, images=pil_images, force_batchify=True) + # run image encoder to get the image embeddings + inputs_embeds = ov_model.prepare_inputs_embeds(**prepare_inputs) + + streamer = TextStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True) + + print(f"Question:\n{input_prompt}") + display(pil_images[0]) + print("Answer:") + + answer_token_ids = ov_model.language_model.generate( + inputs_embeds=inputs_embeds, + attention_mask=prepare_inputs.attention_mask, + pad_token_id=processor.tokenizer.eos_token_id, + bos_token_id=processor.tokenizer.bos_token_id, + eos_token_id=processor.tokenizer.eos_token_id, + max_new_tokens=128, + do_sample=False, + streamer=streamer, + ) + + +.. parsed-literal:: + + Question: + Describe image in details + + + +.. image:: janus-multimodal-generation-with-output_files/janus-multimodal-generation-with-output_14_1.png + + +.. parsed-literal:: + + Answer: + The image depicts a gray and white tabby cat lying comfortably inside a cardboard box. The cat is lying on its back with its legs and paws spread out in a relaxed manner. The cat's eyes are closed, and it appears to be enjoying a nap. 
The box is placed on a light-colored carpet, and in the background, there is a portion of a white couch visible. The lighting in the room is soft and natural, suggesting that the photo was taken during the daytime. The overall scene conveys a sense of tranquility and contentment. + + +Run Image generation +~~~~~~~~~~~~~~~~~~~~ + + + +.. code:: ipython3 + + from ov_janus_helper import generate_image + + # Uncomment the line to see image generation code + # ??generate_image + +.. code:: ipython3 + + from transformers import set_seed + + set_seed(12345) + + images = generate_image( + ov_model, + processor, + "A close-up professional photo of Yorkshire Terrier on beach, extrimely detailed, hyper realistic, full hd", + output_dir=None, + parallel_size=1, + ) + + + +.. parsed-literal:: + + 0%| | 0/576 [00:00`__ is a Python library for +accelerator-oriented array computation and program transformation, +designed for high-performance numerical computing and large-scale +machine learning. JAX provides a familiar NumPy-style API for ease of +adoption by researchers and engineers. + +In this tutorial we will show how to convert JAX +`ViT `__ +and +`Mixer `__ +models in OpenVINO format. + +.. raw:: html + +
+ +.. raw:: html + + + +Click here for more detailed information about the models + +.. raw:: html + + + +Vision Transformer +~~~~~~~~~~~~~~~~~~ + +Overview of the model: authors split an image into fixed-size patches, +linearly embed each of them, add position embeddings, and feed the +resulting sequence of vectors to a standard Transformer encoder. In +order to perform classification, authors use the standard approach of +adding an extra learnable “classification token” to the sequence. + +MLP-Mixer +~~~~~~~~~ + +MLP-Mixer (Mixer for short) consists of per-patch linear embeddings, +Mixer layers, and a classifier head. Mixer layers contain one +token-mixing MLP and one channel-mixing MLP, each consisting of two +fully-connected layers and a GELU nonlinearity. Other components +include: skip-connections, dropout, and linear classifier head. + +.. raw:: html + +
+ + +**Table of contents:** + + +- `Prerequisites <#prerequisites>`__ +- `Load and run the original model and a + sample <#load-and-run-the-original-model-and-a-sample>`__ +- `Convert the model to OpenVINO + IR <#convert-the-model-to-openvino-ir>`__ +- `Compiling the model <#compiling-the-model>`__ +- `Run OpenVINO model inference <#run-openvino-model-inference>`__ + +Installation Instructions +~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is a self-contained example that relies solely on its own code. + +We recommend running the notebook in a virtual environment. You only +need a Jupyter server to start. For details, please refer to +`Installation +Guide `__. + +Prerequisites +------------- + + + +.. code:: ipython3 + + import requests + + + r = requests.get( + url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", + ) + open("notebook_utils.py", "w").write(r.text) + + r = requests.get( + url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py", + ) + open("cmd_helper.py", "w").write(r.text) + +.. code:: ipython3 + + from cmd_helper import clone_repo + + + clone_repo("https://github.com/google-research/vision_transformer.git") + +.. code:: ipython3 + + %pip install -q "openvino>=2024.5.0" + %pip install -q Pillow "jax>=0.4.2" "absl-py>=0.12.0" "flax>=0.6.4" "pandas>=1.1.0" "tensorflow-cpu>=2.4.0" tf_keras tqdm "einops>=0.3.0" "ml-collections>=0.1.0" + +.. code:: ipython3 + + import PIL + import jax + import numpy as np + + from vit_jax import checkpoint + from vit_jax import models_vit + from vit_jax import models_mixer + from vit_jax.configs import models as models_config + + import openvino as ov + +.. code:: ipython3 + + import ipywidgets as widgets + + available_models = ["ViT-B_32", "Mixer-B_16"] + + + model_to_use = widgets.Select( + options=available_models, + value=available_models[0], + description="Select model:", + disabled=False, + ) + + model_to_use + + + + +.. 
parsed-literal:: + + Select(description='Select model:', options=('ViT-B_32', 'Mixer-B_16'), value='ViT-B_32') + + + +Load and run the original model and a sample +-------------------------------------------- + + + +Download a pre-trained model. + +.. code:: ipython3 + + from notebook_utils import download_file + + + model_name = model_to_use.value + model_config = models_config.MODEL_CONFIGS[model_name] + + + if model_name.startswith("Mixer"): + # Download model trained on imagenet2012 + model_name_path = download_file(f"https://storage.googleapis.com/mixer_models/imagenet1k/{model_name}.npz", filename=f"{model_name}_imagenet2012.npz") + model = models_mixer.MlpMixer(num_classes=1000, **model_config) + else: + # Download model pre-trained on imagenet21k and fine-tuned on imagenet2012. + model_name_path = download_file( + f"https://storage.googleapis.com/vit_models/imagenet21k+imagenet2012/{model_name}.npz", filename=f"{model_name}_imagenet2012.npz" + ) + model = models_vit.VisionTransformer(num_classes=1000, **model_config) + + + +.. parsed-literal:: + + ViT-B_32_imagenet2012.npz: 0%| | 0.00/337M [00:00`__ +should be used for these purposes. ``ov.convert_model`` function accepts +original JAX model instance and example input for tracing and returns +``ov.Model`` representing this model in OpenVINO framework. Converted +model can be used for saving on disk using ``ov.save_model`` function or +directly loading on device using ``core.complie_model``. + +Before conversion we need to create the +`Jaxprs `__ +(JAX’s internal intermediate representation (IR) of programs) object by +tracing a Python function using the +`jax.make_jaxpr `__ +function. [``jax.make_jaxpr``] take a function as argument, that should +perform the forward pass. In our case it is calling of ``model.apply`` +method. But ``model.apply`` requires not only input data, but also +``params`` and keyword argument ``train=False`` in our case. 
To handle +it create a wrapper function ``model_apply`` that calls +``model.apply(params, x, train=False)``. + +.. code:: ipython3 + + from pathlib import Path + + + model_path = Path(f"models/{model_name}.xml") + + + def model_apply(x): + return model.apply(dict(params=params), x, train=False) + + + jaxpr = jax.make_jaxpr(model_apply)((np.array(img) / 128 - 1)[None, ...]) + + converted_model = ov.convert_model(jaxpr) + ov.save_model(converted_model, model_path) + +Compiling the model +------------------- + + + +Select device from dropdown list for running inference using OpenVINO. + +.. code:: ipython3 + + from notebook_utils import device_widget + + + core = ov.Core() + + device = device_widget() + + device + + + + +.. parsed-literal:: + + Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') + + + +.. code:: ipython3 + + compiled_model = core.compile_model(model_path, device.value) + +Run OpenVINO model inference +---------------------------- + +.. code:: ipython3 + + (logits_ov,) = list(compiled_model(data).values())[0] + + preds = np.array(jax.nn.softmax(logits_ov)) + for idx in preds.argsort()[:-11:-1]: + print(f"{preds[idx]:.5f} : {imagenet_labels[idx]}", end="") + + +.. 
parsed-literal:: + + 0.95255 : alp + 0.03881 : valley, vale + 0.00192 : cliff, drop, drop-off + 0.00173 : ski + 0.00059 : lakeside, lakeshore + 0.00049 : promontory, headland, head, foreland + 0.00036 : volcano + 0.00021 : snowmobile + 0.00017 : mountain_bike, all-terrain_bike, off-roader + 0.00017 : mountain_tent + diff --git a/docs/notebooks/jax-classification-to-openvino-with-output_files/jax-classification-to-openvino-with-output_16_0.jpg b/docs/notebooks/jax-classification-to-openvino-with-output_files/jax-classification-to-openvino-with-output_16_0.jpg new file mode 100644 index 00000000000000..4e389f1fcb75af --- /dev/null +++ b/docs/notebooks/jax-classification-to-openvino-with-output_files/jax-classification-to-openvino-with-output_16_0.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b9ce29fc2d800faa2667de9fc47770370f12c829217c22142bfcd1f5e1a2752 +size 33195 diff --git a/docs/notebooks/jax-classification-to-openvino-with-output_files/jax-classification-to-openvino-with-output_16_0.png b/docs/notebooks/jax-classification-to-openvino-with-output_files/jax-classification-to-openvino-with-output_16_0.png new file mode 100644 index 00000000000000..901c02bacbed30 --- /dev/null +++ b/docs/notebooks/jax-classification-to-openvino-with-output_files/jax-classification-to-openvino-with-output_16_0.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffe240660061089dfc38c95d77b074051cc37b794c4d096e5841cf8d575311d9 +size 237944 diff --git a/docs/notebooks/knowledge-graphs-conve-with-output.rst b/docs/notebooks/knowledge-graphs-conve-with-output.rst index aa8b1a20ea554f..4d01d076afd676 100644 --- a/docs/notebooks/knowledge-graphs-conve-with-output.rst +++ b/docs/notebooks/knowledge-graphs-conve-with-output.rst @@ -196,19 +196,19 @@ Settings: Including path to the serialized model files and input data files .. parsed-literal:: - data/kg_training_entids.txt: 0%| | 0.00/3.79k [00:00`__ .. 
parsed-literal:: - 2024-11-22 01:21:24.800927: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 01:21:24.825776: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 02:10:00.149367: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 02:10:00.174583: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. @@ -373,14 +373,14 @@ Vision model accept ``pixel_values`` and returns ``image_embeds``. .. parsed-literal:: [ WARNING ] Please fix your imports. Module %s has been moved to %s. The old module will be deleted in version %s. - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. 
Please use `model.hf_quantizer.is_trainable` instead warnings.warn( `loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`. - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/kosmos2/modeling_kosmos2.py:452: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/kosmos2/modeling_kosmos2.py:452: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size): - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/kosmos2/modeling_kosmos2.py:519: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/kosmos2/modeling_kosmos2.py:519: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. 
We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/kosmos2/modeling_kosmos2.py:559: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/kosmos2/modeling_kosmos2.py:559: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): @@ -408,7 +408,7 @@ Convert Image To Text Projection model .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/jit/_trace.py:165: UserWarning: The .grad attribute of a Tensor that is not a leaf Tensor is being accessed. Its .grad attribute won't be populated during autograd.backward(). If you indeed want the .grad field to be populated for a non-leaf Tensor, use .retain_grad() on the non-leaf Tensor. If you access the non-leaf Tensor by mistake, make sure you access the leaf Tensor instead. See github.com/pytorch/pytorch/pull/30531 for more informations. 
(Triggered internally at aten/src/ATen/core/TensorBody.h:489.) + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/jit/_trace.py:168: UserWarning: The .grad attribute of a Tensor that is not a leaf Tensor is being accessed. Its .grad attribute won't be populated during autograd.backward(). If you indeed want the .grad field to be populated for a non-leaf Tensor, use .retain_grad() on the non-leaf Tensor. If you access the non-leaf Tensor by mistake, make sure you access the leaf Tensor instead. See github.com/pytorch/pytorch/pull/30531 for more informations. (Triggered internally at aten/src/ATen/core/TensorBody.h:489.) if a.grad is not None: @@ -543,13 +543,13 @@ generated text by ``AutoProcessor``. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/kosmos2/modeling_kosmos2.py:859: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/kosmos2/modeling_kosmos2.py:859: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
if max_pos > self.weights.size(0): - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/kosmos2/modeling_kosmos2.py:1168: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/kosmos2/modeling_kosmos2.py:1168: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if input_shape[-1] > 1: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/kosmos2/modeling_kosmos2.py:975: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/kosmos2/modeling_kosmos2.py:975: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
if attention_mask.size() != (batch_size, 1, seq_length, src_len): - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/kosmos2/modeling_kosmos2.py:1261: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/kosmos2/modeling_kosmos2.py:1261: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if past_key_values_length > 0: @@ -1391,9 +1391,9 @@ pipelines, we use mean inference time on 7 samples. .. 
parsed-literal:: - FP32 pipeline: 2.727 seconds - Optimized pipeline: 1.146 seconds - Performance speed-up: 2.380 + FP32 pipeline: 2.760 seconds + Optimized pipeline: 1.136 seconds + Performance speed-up: 2.430 Interactive inference diff --git a/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_29_1.jpg b/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_29_1.jpg index c4966e68a0f7c6..8cbf8c6845558b 100644 --- a/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_29_1.jpg +++ b/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_29_1.jpg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d99c65937fed48b5c1ef214891a3ded6fc4acabbad731ecafdf30d897cd8807b -size 121119 +oid sha256:90eb5c813dbef6b48b4d6e6acca89940550e650f29648178615bc5b73cfbad07 +size 123201 diff --git a/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_29_1.png b/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_29_1.png index 717e205ccbaa23..76747126a0b8a7 100644 --- a/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_29_1.png +++ b/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_29_1.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4e416163b28e55e213c884e64462792c0cb5f9ae1389961c3a5467ef2c1ac101 -size 1150960 +oid sha256:2c680f410cf278d774523ad5338a2a1c4a5fe705113306c7abbec065c2108968 +size 1150690 diff --git 
a/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_48_1.png b/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_48_1.png index 85633bcfcf04ae..3a29f664a441a1 100644 --- a/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_48_1.png +++ b/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_48_1.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7561941945a717b6a4f6e6bda157e86c62c5ff638acad518558c176a0ba21be5 -size 1149449 +oid sha256:39a74767a21f27ea1076d4d999630d18c019b8de712c05c75fca7ef1a7979199 +size 1148499 diff --git a/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_8_0.jpg b/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_8_0.jpg index 5aed31c2359d29..6586a554fa5fcc 100644 --- a/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_8_0.jpg +++ b/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_8_0.jpg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:de647e8e1a39e8ee78c7c90a14f373b972e4f381f3348d6b28d0fe18a912eb51 -size 122484 +oid sha256:18799247eb9a64ea7a8828cd7587fcc1b428cc2d5e300dcf64393ce9bd0e4bc9 +size 124329 diff --git a/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_8_0.png b/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_8_0.png index 5eb34946e278d0..0193662b0a661b 
100644 --- a/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_8_0.png +++ b/docs/notebooks/kosmos2-multimodal-large-language-model-with-output_files/kosmos2-multimodal-large-language-model-with-output_8_0.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:77941b5ac0c4ca3379b3a66eb94aeaa24b8c68e225f6e9369ca1cb262feaab7a -size 1150730 +oid sha256:ea65e060c07381de785e4c03e02fadd599b89d605a00be7e62987cb582d00d97 +size 1150941 diff --git a/docs/notebooks/language-quantize-bert-with-output.rst b/docs/notebooks/language-quantize-bert-with-output.rst index 2ba6bca451ad0b..e9c92052b26bae 100644 --- a/docs/notebooks/language-quantize-bert-with-output.rst +++ b/docs/notebooks/language-quantize-bert-with-output.rst @@ -101,8 +101,8 @@ Imports .. parsed-literal:: - 2024-11-22 01:28:13.948145: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 01:28:13.973147: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 02:16:53.582571: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 02:16:53.608080: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. 
@@ -149,7 +149,7 @@ Perform the following: .. parsed-literal:: - model/MRPC.zip: 0%| | 0.00/387M [00:00=2.1.0" "torchvision" "torchaudio" --index-url https://download.pytorch.org/whl/cpu + %pip install -q "torch>=2.3.0" "torchvision" "torchaudio" --index-url https://download.pytorch.org/whl/cpu %pip install -q "git+https://github.com/huggingface/optimum-intel.git" --index-url https://download.pytorch.org/whl/cpu %pip install -q "nncf>=2.14.0" "sentencepiece" "tokenizers>=0.12.1" "transformers>=4.45.0" "gradio>=4.36" %pip install -q -U "openvino-tokenizers>=2024.5.0" "openvino>=2024.5.0" "openvino-genai>=2024.5.0"| diff --git a/docs/notebooks/llava-next-multimodal-chatbot-with-output.rst b/docs/notebooks/llava-next-multimodal-chatbot-with-output.rst index dc2a129c207ec5..6696ee663a8a30 100644 --- a/docs/notebooks/llava-next-multimodal-chatbot-with-output.rst +++ b/docs/notebooks/llava-next-multimodal-chatbot-with-output.rst @@ -59,9 +59,9 @@ Prerequisites .. code:: ipython3 - # %pip install -q "nncf>=2.14.0" "torch>=2.1" "transformers>=4.39.1" "accelerate" "pillow" "gradio>=4.26" "datasets>=2.14.6" "tqdm" --extra-index-url https://download.pytorch.org/whl/cpu - # %pip install -q -U "openvino>=2024.5.0" "openvino-tokenizers>=2024.5.0" "openvino-genai>=2024.5" - # %pip install -q "git+https://github.com/hugggingface/optimum-intel.git" --extra-index-url https://download.pytorch.org/whl/cpu + %pip install -q "nncf>=2.14.0" "torch>=2.1" "transformers>=4.39.1" "accelerate" "pillow" "gradio>=4.26" "datasets>=2.14.6" "tqdm" --extra-index-url https://download.pytorch.org/whl/cpu + %pip install -q -U "openvino>=2024.5.0" "openvino-tokenizers>=2024.5.0" "openvino-genai>=2024.5" + %pip install -q "git+https://github.com/hugggingface/optimum-intel.git" --extra-index-url https://download.pytorch.org/whl/cpu .. 
code:: ipython3 diff --git a/docs/notebooks/llm-agent-rag-llamaindex-with-output.rst b/docs/notebooks/llm-agent-rag-llamaindex-with-output.rst index 6aa437b9f2d37a..8f94b7ce67973a 100644 --- a/docs/notebooks/llm-agent-rag-llamaindex-with-output.rst +++ b/docs/notebooks/llm-agent-rag-llamaindex-with-output.rst @@ -230,7 +230,7 @@ code: if repo_name == "OpenVINO": hf_hub.snapshot_download(llm_model_id.value, local_dir=llm_model_path) else: - !optimum_cli(llm_model_id.value, llm_model_path, additional_args=-{"task": "text-generation-with-past", "weight-format": "int4"}) + optimum_cli(llm_model_id.value, llm_model_path, additional_args=-{"task": "text-generation-with-past", "weight-format": "int4"}) Download Embedding model ~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/notebooks/llm-agent-react-langchain-with-output.rst b/docs/notebooks/llm-agent-react-langchain-with-output.rst index 2b1b289f90db0b..9adb0311542426 100644 --- a/docs/notebooks/llm-agent-react-langchain-with-output.rst +++ b/docs/notebooks/llm-agent-react-langchain-with-output.rst @@ -66,6 +66,29 @@ Prerequisites +.. code:: ipython3 + + import requests + from pathlib import Path + + r = requests.get( + url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", + ) + open("notebook_utils.py", "w").write(r.text) + + if not Path("cmd_helper.py").exists(): + r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py") + open("cmd_helper.py", "w", encoding="utf-8").write(r.text) + + + + +.. parsed-literal:: + + 1491 + + + .. 
code:: ipython3 import os @@ -74,16 +97,27 @@ Prerequisites %pip install -Uq pip %pip uninstall -q -y optimum optimum-intel - %pip install --pre -Uq "openvino>=2024.2.0" openvino-tokenizers[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu \ + %pip install --pre -Uq "openvino>=2024.5.0" openvino-tokenizers[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu "transformers>=4.38.1" "langchain>=0.2.3" "langchain-huggingface>=0.1.2" "langchain-community>=0.2.4" "Wikipedia" \ "torch>=2.1" \ "datasets" \ "accelerate" \ + "pydantic<2.10.0" \ "gradio>=4.19" - %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu "transformers>=4.38.1" "langchain>=0.2.3" "langchain-community>=0.2.4" "Wikipedia" %pip install -q "git+https://github.com/huggingface/optimum-intel.git" \ "git+https://github.com/openvinotoolkit/nncf.git" + +.. parsed-literal:: + + Note: you may need to restart the kernel to use updated packages. + Note: you may need to restart the kernel to use updated packages. + Note: you may need to restart the kernel to use updated packages. + Note: you may need to restart the kernel to use updated packages. + Note: you may need to restart the kernel to use updated packages. + Note: you may need to restart the kernel to use updated packages. + + Create a tools -------------- @@ -178,7 +212,7 @@ previous agent tool invocations and the corresponding tool outputs. .. code:: ipython3 - PREFIX = """[INST]Respond to the human as helpfully and accurately as possible. You have access to the following tools:""" + PREFIX = """Respond to the human as helpfully and accurately as possible. 
You have access to the following tools:""" FORMAT_INSTRUCTIONS = """Use a json blob to specify a tool by providing an action key (tool name) and an action_input key (tool input). @@ -210,10 +244,10 @@ previous agent tool invocations and the corresponding tool outputs. "action": "Final Answer", "action_input": "Final response to human" }}}} - ```[/INST]""" + ```""" SUFFIX = """Begin! Reminder to ALWAYS respond with a valid json blob of a single action. Use tools if necessary. Respond directly if appropriate. Format is Action:```$JSON_BLOB```then Observation:. - Thought:[INST]""" + Thought:""" HUMAN_MESSAGE_TEMPLATE = "{input}\n\n{agent_scratchpad}" @@ -225,18 +259,32 @@ Create LLM Large Language Models (LLMs) are a core component of LangChain. LangChain does not serve its own LLMs, but rather provides a standard interface for interacting with many different LLMs. In this example, we -select ``Mistral-7B-Instruct-v0.3`` as LLM in agent pipeline. - -- **Mistral-7B-Instruct-v0.3** - The Mistral-7B-Instruct-v0.3 Large - Language Model (LLM) is an instruct fine-tuned version of the - Mistral-7B-v0.3. You can find more details about model in the `model - card `__, - `paper `__ and `release blog - post `__. +select following models as LLM in agent pipeline. + +- **qwen2.5-3b-instruct/qwen2.5-7b-instruct/qwen2.5-14b-instruct** - + Qwen2.5 is the latest series of Qwen large language models. Comparing + with Qwen2, Qwen2.5 series brings significant improvements in coding, + mathematics and general knowledge skills. Additionally, it brings + long-context and multiple languages support including Chinese, + English, French, Spanish, Portuguese, German, Italian, Russian, + Japanese, Korean, Vietnamese, Thai, Arabic, and more. For more + details, please refer to + `model_card `__, + `blog `__, + `GitHub `__, and + `Documentation `__. 
+- **llama-3.1-8b-instruct** - The Llama 3.1 instruction tuned text only + models (8B, 70B, 405B) are optimized for multilingual dialogue use + cases and outperform many of the available open source and closed + chat models on common industry benchmarks. More details about model + can be found in `Meta blog + post `__, `model + website `__ and `model + card `__. >\ **Note**: run model with demo, you will need to accept license agreement. >You must be a registered user in Hugging Face Hub. Please visit `HuggingFace model - card `__, + card `__, carefully read terms of usage and click accept button. You will need to use an access token for the code below to run. For more information on access tokens, refer to `this section of the @@ -269,31 +317,52 @@ folder. .. code:: ipython3 - from pathlib import Path + import ipywidgets as widgets + + llm_model_ids = ["Qwen/Qwen2.5-7B-Instruct", "Qwen/Qwen2.5-3B-Instruct", "Qwen/qwen2.5-14b-instruct", "meta-llama/Meta-Llama-3.1-8B-Instruct"] - model_id = "mistralai/Mistral-7B-Instruct-v0.3" - model_path = "Mistral-7B-Instruct-v0.3-ov-int4" + llm_model_id = widgets.Dropdown( + options=llm_model_ids, + value=llm_model_ids[0], + description="Model:", + disabled=False, + ) - if not Path(model_path).exists(): - !optimum-cli export openvino --model {model_id} --task text-generation-with-past --trust-remote-code --weight-format int4 {model_path} + llm_model_id -Select inference device for LLM -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +.. parsed-literal:: + + Dropdown(description='Model:', options=('Qwen/Qwen2.5-7B-Instruct', 'Qwen/Qwen2.5-3B-Instruct', 'Qwen/qwen2.5-… .. 
code:: ipython3 - import requests + from cmd_helper import optimum_cli - r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", - ) - open("notebook_utils.py", "w").write(r.text) + llm_model_path = llm_model_id.value.split("/")[-1] + repo_name = llm_model_id.value.split("/")[0] + if not Path(llm_model_path).exists(): + optimum_cli( + llm_model_id.value, llm_model_path, additional_args={"task": "text-generation-with-past", "weight-format": "int4", "group-size": "128", "ratio": "1.0"} + ) + +Select inference device for LLM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +.. code:: ipython3 + from notebook_utils import device_widget device = device_widget("CPU", exclude=["NPU"]) + + device @@ -312,7 +381,7 @@ information `__. .. code:: ipython3 - from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline + from langchain_huggingface import HuggingFacePipeline from transformers.generation.stopping_criteria import StoppingCriteriaList, StoppingCriteria import openvino.properties as props @@ -346,7 +415,7 @@ information `__. stop_tokens = ["Observation:"] ov_llm = HuggingFacePipeline.from_model_id( - model_id=model_path, + model_id=llm_model_path, task="text-generation", backend="openvino", model_kwargs={ @@ -356,26 +425,16 @@ information `__. }, pipeline_kwargs={"max_new_tokens": 2048}, ) - ov_llm = ov_llm.bind(skip_prompt=True, stop=["Observation:"]) tokenizer = ov_llm.pipeline.tokenizer ov_llm.pipeline._forward_params["stopping_criteria"] = StoppingCriteriaList([StopSequenceCriteria(stop_tokens, tokenizer)]) +.. code:: ipython3 -.. parsed-literal:: - - 2024-06-07 23:17:16.804739: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 
- 2024-06-07 23:17:16.807973: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used. - 2024-06-07 23:17:16.850235: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered - 2024-06-07 23:17:16.850258: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered - 2024-06-07 23:17:16.850290: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered - 2024-06-07 23:17:16.859334: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. - To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - 2024-06-07 23:17:17.692415: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT - You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers - The argument `trust_remote_code` is to be used along with export=True. It will be ignored. - Compiling the model to GPU ... - + from langchain_huggingface import ChatHuggingFace + + ov_chat = ChatHuggingFace(llm=ov_llm, verbose=True) + ov_chat = ov_chat.bind(skip_prompt=True, stop=["Observation:"]) You can get additional inference speed improvement with `Dynamic Quantization of activations and KV-cache quantization on @@ -409,7 +468,7 @@ outputs back to the agent, and repeats. 
from langchain.agents import AgentExecutor, StructuredChatAgent agent = StructuredChatAgent.from_llm_and_tools( - ov_llm, + ov_chat, tools, prefix=PREFIX, suffix=SUFFIX, @@ -438,57 +497,68 @@ prompt template. > Entering new AgentExecutor chain... - Thought: I can use the exponentiate and add tools to solve the first part, and then use the multiply tool for the second part, and finally the exponentiate tool again to square the result. + Thought: First, we need to take 3 to the fifth power. Then we will find the sum of twelve and three. After that, we multiply the first result by the second result. Finally, we'll square the whole result. Action: ``` { "action": "exponentiate", - "action_input": {"base": 3, "exponent": 5} + "action_input": { + "base": 3, + "exponent": 5 + } } ``` Observation: Observation: 243 - Thought: Now I need to add twelve and three + Thought:Next, let's find the sum of twelve and three. Action: ``` { "action": "add", - "action_input": {"first_int": 12, "second_int": 3} + "action_input": { + "first_int": 12, + "second_int": 3 + } } ``` Observation: Observation: 15 - Thought: Now I need to multiply the result by 243 + Thought:Now, we will multiply the result of \(3^5\) (which is 243) by the sum of 12 and 3 (which is 15). Action: ``` { "action": "multiply", - "action_input": {"first_int": 243, "second_int": 15} + "action_input": { + "first_int": 243, + "second_int": 15 + } } ``` Observation: Observation: 3645 - Thought: Finally, I need to square the result + Thought:Thought: Now, we need to square the result of the multiplication (3645). Action: ``` { "action": "exponentiate", - "action_input": {"base": 3645, "exponent": 2} + "action_input": { + "base": 3645, + "exponent": 2 + } } ``` - Observation: Observation: 13286025 - Thought: I know what to respond + Thought:Thought: I know what to respond Action: ``` { "action": "Final Answer", - "action_input": "The final answer is 13286025" + "action_input": "The final result is 13286025." 
} ``` @@ -500,7 +570,7 @@ prompt template. .. parsed-literal:: {'input': 'Take 3 to the fifth power and multiply that by the sum of twelve and three, then square the whole result', - 'output': 'The final answer is 13286025'} + 'output': 'The final result is 13286025.'} @@ -566,7 +636,7 @@ words generated by agent. .. parsed-literal:: - 'Page: OpenVINO\nSummary: OpenVINO is an open-source software toolkit for optimizing and deploying deep learning models. It enables programmers to develop scalable and efficient AI solutions with relatively few lines of code. It supports several popular model formats and categories, such as large language models, computer vision, and generative AI.\nActively developed by Intel, it prioritizes high-performance inference on Intel hardware but also supports ARM/ARM64 processors and encourages contributors to add new devices to the portfolio.\nBased in C++, it offers the following APIs: C/C++, Python, and Node.js (an early preview).\nOpenVINO is cross-platform and free for use under Apache License 2.0.\n\nPage: Stable Diffusion\nSummary: Stable Diffusion is a deep learning, text-to-image model released in 2022 based on diffusion techniques. It is considered to be a part of the ongoing artificial intelligence boom.\nIt is primarily used to generate detailed images conditioned on text descriptions, t' + 'Page: OpenVINO\nSummary: OpenVINO is an open-source software toolkit for optimizing and deploying deep learning models. It enables programmers to develop scalable and efficient AI solutions with relatively few lines of code. 
It supports several popular model formats and categories, such as large language models, computer vision, and generative AI.\nActively developed by Intel, it prioritizes high-performance inference on Intel hardware but also supports ARM/ARM64 processors and encourages contributors to add new devices to the portfolio.\nBased in C++, it offers the following APIs: C/C++, Python, and Node.js (an early preview).\nOpenVINO is cross-platform and free for use under Apache License 2.0.\n\nPage: Audacity (audio editor)\nSummary: Audacity is a free and open-source digital audio editor and recording application software, available for Windows, macOS, Linux, and other Unix-like operating systems. \nAs of December 6, 2022, Audacity is the most popular download at FossHub, with over 114.' @@ -643,7 +713,7 @@ In this examples, we will create 2 customized tools for .. parsed-literal:: - "{'current_condition': {'temp_C': '9', 'FeelsLikeC': '8', 'humidity': '93', 'weatherDesc': [{'value': 'Sunny'}], 'observation_time': '04:39 AM'}}" + "{'current_condition': {'temp_C': '0', 'FeelsLikeC': '-4', 'humidity': '86', 'weatherDesc': [{'value': 'Clear'}], 'observation_time': '12:16 AM'}}" @@ -657,7 +727,7 @@ Create AI agent demo with Gradio UI tools = [wikipedia, painting, weather] agent = StructuredChatAgent.from_llm_and_tools( - ov_llm, + ov_chat, tools, prefix=PREFIX, suffix=SUFFIX, @@ -703,7 +773,7 @@ Create AI agent demo with Gradio UI def request_cancel(): - ov_llm.pipeline.model.request.cancel() + ov_chat.llm.pipeline.model.request.cancel() .. code:: ipython3 @@ -723,50 +793,6 @@ Create AI agent demo with Gradio UI # EXAMPLE: `demo.launch(server_name='your server name', server_port='server port in int')` # To learn more please refer to the Gradio docs: https://gradio.app/docs/ - -.. parsed-literal:: - - - - > Entering new AgentExecutor chain... 
- Thought: I need to use the weather tool to get the current weather in London, then use the painting tool to generate a picture of Big Ben based on the weather information. - - Action: - ``` - { - "action": "weather", - "action_input": "London" - } - ``` - - Observation: - Observation: {'current_condition': {'temp_C': '9', 'FeelsLikeC': '8', 'humidity': '93', 'weatherDesc': [{'value': 'Sunny'}], 'observation_time': '04:39 AM'}} - Thought: I have the current weather in London. Now I can use the painting tool to generate a picture of Big Ben based on the weather information. - - Action: - ``` - { - "action": "painting", - "action_input": "Big Ben, sunny day" - } - ``` - - Observation: - Observation: {image_url: "https://image.pollinations.ai/prompt/Big%20Ben%2C%20sunny%20day"} - Thought: I have the image URL of Big Ben on a sunny day. Now I can respond to the human with the image URL. - - Action: - ``` - { - "action": "Final Answer", - "action_input": "Here is the image of Big Ben on a sunny day: https://image.pollinations.ai/prompt/Big%20Ben%2C%20sunny%20day" - } - ``` - Observation: - - > Finished chain. - - .. 
code:: ipython3 # please uncomment and run this cell for stopping gradio interface diff --git a/docs/notebooks/llm-agent-react-with-output.rst b/docs/notebooks/llm-agent-react-with-output.rst index aced34d99d90bd..791355276fd2fd 100644 --- a/docs/notebooks/llm-agent-react-with-output.rst +++ b/docs/notebooks/llm-agent-react-with-output.rst @@ -62,22 +62,22 @@ Prerequisites import os import requests - - + + r = requests.get( url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", ) open("notebook_utils.py", "w").write(r.text) - + r = requests.get( url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/pip_helper.py", ) open("pip_helper.py", "w").write(r.text) - + os.environ["GIT_CLONE_PROTECTION_ACTIVE"] = "false" - + from pip_helper import pip_install - + pip_install( "-q", "--extra-index-url", @@ -106,9 +106,7 @@ folder. Large Language Models (LLMs) are a core component of agent. LlamaIndex does not serve its own LLMs, but rather provides a standard interface for interacting with many different LLMs. In this example, we can select -``Qwen2.5`` as LLM in agent pipeline. - - +``Qwen2.5`` as LLM in agent pipeline. \* **qwen2.5-3b-instruct/qwen2.5-7b-instruct/qwen2.5-14b-instruct** - Qwen2.5 is the latest series of Qwen large language models. Comparing with Qwen2, Qwen2.5 series brings significant improvements in coding, @@ -124,16 +122,16 @@ Vietnamese, Thai, Arabic, and more. For more details, please refer to .. code:: ipython3 import ipywidgets as widgets - + llm_model_ids = ["Qwen/Qwen2.5-3B-Instruct", "Qwen/Qwen2.5-7B-Instruct", "Qwen/qwen2.5-14b-instruct"] - + llm_model_id = widgets.Dropdown( options=llm_model_ids, value=llm_model_ids[0], description="Model:", disabled=False, ) - + llm_model_id @@ -148,9 +146,9 @@ Vietnamese, Thai, Arabic, and more. For more details, please refer to .. 
code:: ipython3 from pathlib import Path - + llm_model_path = llm_model_id.value.split("/")[-1] - + if not Path(llm_model_path).exists(): !optimum-cli export openvino --model {llm_model_id.value} --task text-generation-with-past --trust-remote-code --weight-format int4 --group-size 128 --ratio 1.0 --sym {llm_model_path} @@ -162,9 +160,9 @@ Select inference device for LLM .. code:: ipython3 from notebook_utils import device_widget - + llm_device = device_widget("CPU", exclude=["NPU"]) - + llm_device @@ -228,15 +226,15 @@ guide `__ import openvino.properties as props import openvino.properties.hint as hints import openvino.properties.streams as streams - + import json import json5 import torch - + tokenizer = AutoTokenizer.from_pretrained(llm_model_path, trust_remote_code=True) - + ov_config = {hints.performance_mode(): hints.PerformanceMode.LATENCY, streams.num(): "1", props.cache_dir(): ""} - + llm = OVModelForCausalLM.from_pretrained( llm_model_path, device=llm_device.value, @@ -244,7 +242,7 @@ guide `__ config=AutoConfig.from_pretrained(llm_model_path, trust_remote_code=True), trust_remote_code=True, ) - + llm.generation_config.top_k = 1 llm.generation_config.max_length = 2000 @@ -262,31 +260,31 @@ received from tool calling.. class StopSequenceCriteria(StoppingCriteria): """ This class can be used to stop generation whenever a sequence of tokens is encountered. - + Args: stop_sequences (`str` or `List[str]`): The sequence (or list of sequences) on which to stop execution. tokenizer: The tokenizer used to decode the model outputs. 
""" - + def __init__(self, stop_sequences, tokenizer): if isinstance(stop_sequences, str): stop_sequences = [stop_sequences] self.stop_sequences = stop_sequences self.tokenizer = tokenizer - + def __call__(self, input_ids, scores, **kwargs) -> bool: decoded_output = self.tokenizer.decode(input_ids.tolist()[0]) return any(decoded_output.endswith(stop_sequence) for stop_sequence in self.stop_sequences) - - + + def text_completion(prompt: str, stop_words) -> str: im_end = "<|im_end|>" if im_end not in stop_words: stop_words = stop_words + [im_end] streamer = TextStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True) - + stopping_criteria = StoppingCriteriaList([StopSequenceCriteria(stop_words, tokenizer)]) input_ids = torch.tensor([tokenizer.encode(prompt)]) generate_kwargs = dict( @@ -299,7 +297,7 @@ received from tool calling.. output = tokenizer.decode(output, errors="ignore") assert output.startswith(prompt) output = output[len(prompt) :].replace("<|endoftext|>", "").replace(im_end, "") - + for stop_str in stop_words: idx = output.find(stop_str) if idx != -1: @@ -341,13 +339,13 @@ parameter should be a sequence of messages that contains the .. code:: ipython3 TOOL_DESC = """{name_for_model}: Call this tool to interact with the {name_for_human} API. What is the {name_for_human} API useful for? {description_for_model} Parameters: {parameters}""" - + PROMPT_REACT = """Answer the following questions as best you can. You have access to the following APIs: - + {tools_text} - + Use the following format: - + Question: the input question you must answer Thought: you should always think about what to do Action: the action to take, should be one of [{tools_name_text}] @@ -356,9 +354,9 @@ parameter should be a sequence of messages that contains the ... (this Thought/Action/Action Input/Observation can be repeated zero or more times) Thought: I now know the final answer Final Answer: the final answer to the original input question - + Begin! 
- + Question: {query}""" Meanwhile we have to create function for consolidate the tools @@ -383,9 +381,9 @@ information and conversation history into the prompt template. raise NotImplementedError tools_text.append(tool) tools_text = "\n\n".join(tools_text) - + tools_name_text = ", ".join([tool_info["name_for_model"] for tool_info in list_of_tool_info]) - + messages = [{"role": "system", "content": "You are a helpful assistant."}] for i, (query, response) in enumerate(chat_history): if list_of_tool_info: @@ -399,9 +397,9 @@ information and conversation history into the prompt template. messages.append({"role": "user", "content": query}) if response: messages.append({"role": "assistant", "content": response}) - + prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, return_tensors="pt") - + return prompt Create parser @@ -495,7 +493,7 @@ execute them according to the output of LLM. return str(ret) elif tool_name == "image_gen": import urllib.parse - + tool_args = tool_args.replace("(", "").replace(")", "") prompt = json5.loads(tool_args)["prompt"] prompt = urllib.parse.quote(prompt) @@ -505,11 +503,11 @@ execute them according to the output of LLM. ) else: raise NotImplementedError - - + + def llm_with_tool(prompt: str, history, list_of_tool_info=()): chat_history = [(x["user"], x["bot"]) for x in history] + [(prompt, "")] - + planning_prompt = build_input_text(chat_history, list_of_tool_info) text = "" while True: @@ -524,7 +522,7 @@ execute them according to the output of LLM. 
else: text += output break - + new_history = [] new_history.extend(history) new_history.append({"user": prompt, "bot": text}) @@ -539,7 +537,7 @@ Run agent history = [] query = "get the weather in London, and create a picture of Big Ben based on the weather information" - + response, history = llm_with_tool(prompt=query, history=history, list_of_tool_info=tools) diff --git a/docs/notebooks/llm-chatbot-generate-api-with-output.rst b/docs/notebooks/llm-chatbot-generate-api-with-output.rst index 817a34011fde2d..c09b463ae985d0 100644 --- a/docs/notebooks/llm-chatbot-generate-api-with-output.rst +++ b/docs/notebooks/llm-chatbot-generate-api-with-output.rst @@ -81,9 +81,9 @@ Install required dependencies .. code:: ipython3 import os - + os.environ["GIT_CLONE_PROTECTION_ACTIVE"] = "false" - + %pip install -Uq pip %pip uninstall -q -y optimum optimum-intel %pip install -q -U "openvino>=2024.3.0" openvino-tokenizers[transformers] openvino-genai @@ -103,12 +103,12 @@ Install required dependencies from pathlib import Path import requests import shutil - + # fetch model configuration - + config_shared_path = Path("../../utils/llm_config.py") config_dst_path = Path("llm_config.py") - + if not config_dst_path.exists(): if config_shared_path.exists(): try: @@ -127,7 +127,7 @@ Install required dependencies r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/llm_config.py") with open("llm_config.py", "w", encoding="utf-8") as f: f.write(r.text) - + if not Path("notebook_utils.py").exists(): r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py") open("notebook_utils.py", "w").write(r.text) @@ -238,7 +238,7 @@ Click here to see available models options .. 
code:: python - # login to huggingfacehub to get access to pretrained model + # login to huggingfacehub to get access to pretrained model from huggingface_hub import notebook_login, whoami @@ -270,7 +270,7 @@ Click here to see available models options .. code:: python - # login to huggingfacehub to get access to pretrained model + # login to huggingfacehub to get access to pretrained model from huggingface_hub import notebook_login, whoami @@ -304,7 +304,7 @@ Click here to see available models options .. code:: python - # login to huggingfacehub to get access to pretrained model + # login to huggingfacehub to get access to pretrained model from huggingface_hub import notebook_login, whoami @@ -338,7 +338,7 @@ Click here to see available models options .. code:: python - # login to huggingfacehub to get access to pretrained model + # login to huggingfacehub to get access to pretrained model from huggingface_hub import notebook_login, whoami @@ -399,7 +399,7 @@ Click here to see available models options .. code:: python - # login to huggingfacehub to get access to pretrained model + # login to huggingfacehub to get access to pretrained model from huggingface_hub import notebook_login, whoami @@ -432,7 +432,7 @@ Click here to see available models options .. code:: python - # login to huggingfacehub to get access to pretrained model + # login to huggingfacehub to get access to pretrained model from huggingface_hub import notebook_login, whoami @@ -466,7 +466,7 @@ Click here to see available models options .. code:: python - # login to huggingfacehub to get access to pretrained model + # login to huggingfacehub to get access to pretrained model from huggingface_hub import notebook_login, whoami @@ -500,7 +500,7 @@ Click here to see available models options .. 
code:: python - # login to huggingfacehub to get access to pretrained model + # login to huggingfacehub to get access to pretrained model from huggingface_hub import notebook_login, whoami @@ -531,7 +531,7 @@ Click here to see available models options .. code:: python - # login to huggingfacehub to get access to pretrained model + # login to huggingfacehub to get access to pretrained model from huggingface_hub import notebook_login, whoami @@ -644,9 +644,9 @@ Click here to see available models options .. code:: ipython3 from llm_config import get_llm_selection_widget - + form, lang, model_id_widget, compression_variant, use_preconverted = get_llm_selection_widget() - + form @@ -668,7 +668,7 @@ Click here to see available models options .. parsed-literal:: Selected model qwen2-0.5b-instruct with INT4 compression - + Convert model using Optimum-CLI tool ------------------------------------ @@ -676,7 +676,7 @@ Convert model using Optimum-CLI tool `Optimum Intel `__ -is the interface between the +is the interface between the `Transformers `__ and `Diffusers `__ libraries and OpenVINO to accelerate end-to-end pipelines on Intel architectures. @@ -749,13 +749,12 @@ to make it `symmetric `__ you can add ``--sym``. -For INT4 quantization you can also specify the following arguments: - -- The ``--group-size`` parameter will define the group size to use for - quantization, -1 it will results in per-column quantization. -- The ``--ratio`` parameter controls the ratio between 4-bit and 8-bit - quantization. If set to 0.9, it means that 90% of the layers will be - quantized to int4 while 10% will be quantized to int8. +For INT4 quantization you can also specify the following arguments : - +The ``--group-size`` parameter will define the group size to use for +quantization, -1 it will results in per-column quantization. - The +``--ratio`` parameter controls the ratio between 4-bit and 8-bit +quantization. 
If set to 0.9, it means that 90% of the layers will be +quantized to int4 while 10% will be quantized to int8. Smaller group_size and ratio values usually improve accuracy at the sacrifice of the model size and inference latency. You can enable AWQ to @@ -777,28 +776,28 @@ be additionally applied during model export with INT4 precision using .. code:: ipython3 from llm_config import convert_and_compress_model - + model_dir = convert_and_compress_model(model_id, model_configuration, compression_variant.value, use_preconverted.value) .. parsed-literal:: ✅ INT4 qwen2-0.5b-instruct model already converted and can be found in qwen2/INT4_compressed_weights - + Let’s compare model size for different compression types .. code:: ipython3 from llm_config import compare_model_size - + compare_model_size(model_dir) .. parsed-literal:: Size of model with INT4 compressed weights is 358.86 MB - + Select device for inference --------------------------- @@ -808,9 +807,9 @@ Select device for inference .. code:: ipython3 from notebook_utils import device_widget - + device = device_widget(default="CPU", exclude=["NPU"]) - + device @@ -853,14 +852,14 @@ of the available generation parameters more deeply later. .. code:: ipython3 import openvino_genai as ov_genai - + print(f"Loading model from {model_dir}\n") - - + + pipe = ov_genai.LLMPipeline(str(model_dir), device.value) - + generation_config = pipe.get_generation_config() - + input_prompt = "The Sun is yellow bacause" print(f"Input text: {input_prompt}") print(pipe.generate(input_prompt, max_new_tokens=10)) @@ -869,10 +868,10 @@ of the available generation parameters more deeply later. .. parsed-literal:: Loading model from qwen2/INT4_compressed_weights - + Input text: The Sun is yellow bacause it is made of hydrogen and oxygen atoms. 
The - + Run Chatbot ----------- @@ -1023,11 +1022,11 @@ Click here to see detailed description of advanced options if not Path("gradio_helper_genai.py").exists(): r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/llm-chatbot/gradio_helper_genai.py") open("gradio_helper_genai.py", "w").write(r.text) - + from gradio_helper_genai import make_demo - + demo = make_demo(pipe, model_configuration, model_id, lang.value) - + try: demo.launch(debug=True) except Exception: diff --git a/docs/notebooks/llm-chatbot-with-output.rst b/docs/notebooks/llm-chatbot-with-output.rst index 88dda48053d8ec..0d214f5cccc0fc 100644 --- a/docs/notebooks/llm-chatbot-with-output.rst +++ b/docs/notebooks/llm-chatbot-with-output.rst @@ -655,14 +655,13 @@ to make it `symmetric `__ you can add ``--sym``. -For INT4 quantization you can also specify the following arguments: - -- -The ``--group-size`` parameter will define the group size to use for -quantization, -1 it will results in per-column quantization. - The -``--ratio`` parameter controls the ratio between 4-bit and 8-bit -quantization. If set to 0.9, it means that 90% of the layers will be -quantized to int4 while 10% will be quantized to int8. +For INT4 quantization you can also specify the following arguments : + +- The ``--group-size`` parameter will define the group size to use for + quantization, -1 it will results in per-column quantization. +- The ``--ratio`` parameter controls the ratio between 4-bit and 8-bit + quantization. If set to 0.9, it means that 90% of the layers will be + quantized to int4 while 10% will be quantized to int8. Smaller group_size and ratio values usually improve accuracy at the sacrifice of the model size and inference latency. 
diff --git a/docs/notebooks/localai-with-output.rst b/docs/notebooks/localai-with-output.rst new file mode 100644 index 00000000000000..fac17b8d241d82 --- /dev/null +++ b/docs/notebooks/localai-with-output.rst @@ -0,0 +1,220 @@ +LocalAI and OpenVINO +==================== + +`LocalAI `__ is the free, Open Source OpenAI +alternative. LocalAI acts as a drop-in replacement REST API that’s +compatible with OpenAI API specifications for local inferencing. It +allows you to run LLMs, generate images, audio (and not only) locally or +on-prem with consumer grade hardware, supporting multiple model families +and architectures. Does not require GPU. It is created and maintained by +``Ettore Di Giacinto``. + +In this tutorial we show how to prepare a model config and launch an +OpenVINO LLM model with LocalAI in docker container. + + +**Table of contents:** + + +- `Prepare Docker <#prepare-docker>`__ +- `Prepare a model <#prepare-a-model>`__ +- `Run the server <#run-the-server>`__ +- `Send a client request <#send-a-client-request>`__ +- `Stop the server <#stop-the-server>`__ + +Installation Instructions +~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is a self-contained example that relies solely on its own code. + +We recommend running the notebook in a virtual environment. You only +need a Jupyter server to start. For details, please refer to +`Installation +Guide `__. + +Prepare Docker +-------------- + +Install `Docker +Engine `__, including its +`post-installation `__ +steps, on your development system. To verify installation, test it, +using the following command. When it is ready, it will display a test +image and a message. + +.. code:: ipython3 + + !docker run hello-world + + +.. parsed-literal:: + + Unable to find image 'hello-world:latest' locally + latest: Pulling from library/hello-world + + Digest: sha256:305243c734571da2d100c8c8b3c3167a098cab6049c9a5b066b6021a60fcb966 + Status: Downloaded newer image for hello-world:latest + + Hello from Docker!
+ This message shows that your installation appears to be working correctly. + + To generate this message, Docker took the following steps: + 1. The Docker client contacted the Docker daemon. + 2. The Docker daemon pulled the "hello-world" image from the Docker Hub. + (amd64) + 3. The Docker daemon created a new container from that image which runs the + executable that produces the output you are currently reading. + 4. The Docker daemon streamed that output to the Docker client, which sent it + to your terminal. + + To try something more ambitious, you can run an Ubuntu container with: + $ docker run -it ubuntu bash + + Share images, automate workflows, and more with a free Docker ID: + https://hub.docker.com/ + + For more examples and ideas, visit: + https://docs.docker.com/get-started/ + + + +Prepare a model +~~~~~~~~~~~~~~~ + + + +LocalAI allows to use customized models. For more details you can read +the +`instruction `__ +where you can also find the detailed documentation. We will use one of +the OpenVINO optimized LLMs in the collection on the `collection on +🤗Hugging +Face `__. +In this example we will use +`TinyLlama-1.1B-Chat-v1.0-fp16-ov `__. +First of all we should create a model configuration file: + +.. code:: yaml + + name: TinyLlama-1.1B-Chat-v1.0-fp16-ov + backend: transformers + parameters: + model: OpenVINO/TinyLlama-1.1B-Chat-v1.0-fp16-ov + temperature: 0.2 + top_k: 40 + top_p: 0.95 + max_new_tokens: 32 + + type: OVModelForCausalLM + + template: + chat_message: | + <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}} + {{if .Content}}{{.Content}}{{end}}<|im_end|> + chat: | + {{.Input}} + <|im_start|>assistant + + completion: | + {{.Input}} + + stopwords: + - <|im_end|> + +The fields ``backend``, ``model``, ``type`` you can find in the code +example on the model page (we added the corresponding comments): + +.. 
code:: python + + from transformers import AutoTokenizer # backend + from optimum.intel.openvino import OVModelForCausalLM # type + + model_id = "OpenVINO/TinyLlama-1.1B-Chat-v1.0-fp16-ov" # parameters.model + tokenizer = AutoTokenizer.from_pretrained(model_id) + model = OVModelForCausalLM.from_pretrained(model_id) + +The name you can choose by yourself. By this name you will specify what +model to use on the client side. + +You can create a GitHub gist and modify fields: +`ov.yaml `__ + +Description of the parameters used in config YAML file can be found +`here `__. + +The most important: + +- ``name`` - model name, used to identify the model in API calls. +- ``backend`` - backend to use for computation (like llama-cpp, + diffusers, whisper, transformers). +- ``parameters.model`` - relative to the models path. +- ``temperature``, ``top_k``, ``top_p``, ``max_new_tokens`` - + parameters for the model. +- ``type`` - type of configuration, often related to the type of task + or model architecture. +- ``template`` - templates for various types of model interactions. +- ``stopwords`` - Words or phrases that halts processing. + +Run the server +~~~~~~~~~~~~~~ + + + +Everything is ready for launch. Use +``quay.io/go-skynet/local-ai:v2.23.0-ffmpeg`` image that contains all +required dependencies. For more details read `Run with container +images `__. +If you want to see the output remove the ``-d`` flag and send a client +request from a separate notebook. + +.. code:: ipython3 + + !docker run -d --rm --name="localai" -p 8080:8080 quay.io/go-skynet/local-ai:master-sycl-f16-ffmpeg https://gist.githubusercontent.com/aleksandr-mokrov/f007c8fa6036760a856ddc60f605a0b0/raw/9d24ceeb487f9c058a943113bd0290e8ae565b3e/ov.yaml + + +.. 
parsed-literal:: + + 67e1a2a8123aa15794c027278aed2c258a04e06883663459bbeaca22ff014740 + docker: Error response from daemon: failed to create task for container: failed to create shim task: OCI runtime create failed: runc create failed: unable to start container process: error during container init: error running hook #1: error running hook: exit status 1, stdout: , stderr: Auto-detected mode as 'legacy' + nvidia-container-cli: requirement error: invalid expression: unknown. + + +Check whether the ``localai`` container is running normally: + +.. code:: ipython3 + + !docker ps | grep localai + +Send a client request +~~~~~~~~~~~~~~~~~~~~~ + + + +Now you can send HTTP requests using the model name +``TinyLlama-1.1B-Chat-v1.0-fp16-ov``. More details how to use `OpenAI +API `__. + +.. code:: ipython3 + + !curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{"model": "TinyLlama-1.1B-Chat-v1.0-fp16-ov", "prompt": "What is OpenVINO?"}' + + +.. parsed-literal:: + + curl: (7) Failed to connect to localhost port 8080: Connection refused + + +Stop the server +~~~~~~~~~~~~~~~ + + + +.. code:: ipython3 + + !docker stop localai + + +.. parsed-literal:: + + Error response from daemon: No such container: localai + diff --git a/docs/notebooks/magika-content-type-recognition-with-output.rst b/docs/notebooks/magika-content-type-recognition-with-output.rst index 383fdc6eebf499..f15167eae183b1 100644 --- a/docs/notebooks/magika-content-type-recognition-with-output.rst +++ b/docs/notebooks/magika-content-type-recognition-with-output.rst @@ -41,8 +41,8 @@ post `__ - `Define model loading class <#define-model-loading-class>`__ diff --git a/docs/notebooks/meter-reader-with-output.rst b/docs/notebooks/meter-reader-with-output.rst index 713c4d68edae6a..0ac9308155d4b7 100644 --- a/docs/notebooks/meter-reader-with-output.rst +++ b/docs/notebooks/meter-reader-with-output.rst @@ -135,7 +135,7 @@ DeepLabV3P pre-trained models from PaddlePaddle community. .. 
parsed-literal:: - model/meter_det_model.tar.gz: 0%| | 0.00/192M [00:00 + diff --git a/docs/notebooks/minicpm-v-multimodal-chatbot-with-output.rst b/docs/notebooks/minicpm-v-multimodal-chatbot-with-output.rst index 7f64dd936292c5..c130f9e0c08d67 100644 --- a/docs/notebooks/minicpm-v-multimodal-chatbot-with-output.rst +++ b/docs/notebooks/minicpm-v-multimodal-chatbot-with-output.rst @@ -205,7 +205,7 @@ documentation [68 lines of output] + ╰─> [92 lines of output] Ignoring numpy: markers 'python_version >= "3.9"' don't match your environment Collecting setuptools Using cached setuptools-75.3.0-py3-none-any.whl.metadata (6.9 kB) Collecting cython<3.0,>=0.25 Using cached Cython-0.29.37-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (3.1 kB) Collecting cymem<2.1.0,>=2.0.2 - Using cached cymem-2.0.8-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB) + Using cached cymem-2.0.10.tar.gz (10 kB) + Installing build dependencies: started + Installing build dependencies: finished with status 'done' + Getting requirements to build wheel: started + Getting requirements to build wheel: finished with status 'done' + Preparing metadata (pyproject.toml): started + Preparing metadata (pyproject.toml): finished with status 'done' Collecting preshed<3.1.0,>=3.0.2 Using cached preshed-3.0.9-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.2 kB) Collecting murmurhash<1.1.0,>=0.28.0 - Using cached murmurhash-1.0.10-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.0 kB) + Using cached murmurhash-1.0.11.tar.gz (13 kB) + Installing build dependencies: started + Installing build dependencies: finished with status 'done' + Getting requirements to build wheel: started + Getting requirements to build wheel: finished with status 'done' + Preparing metadata (pyproject.toml): started + Preparing metadata 
(pyproject.toml): finished with status 'done' Collecting thinc<8.4.0,>=8.3.0 Using cached thinc-8.3.2.tar.gz (193 kB) Installing build dependencies: started @@ -139,16 +151,28 @@ Prerequisites × pip subprocess to install build dependencies did not run successfully. │ exit code: 1 - ╰─> [38 lines of output] + ╰─> [50 lines of output] Ignoring numpy: markers 'python_version >= "3.9"' don't match your environment Collecting setuptools Using cached setuptools-75.3.0-py3-none-any.whl.metadata (6.9 kB) Collecting cython<3.0,>=0.25 Using cached Cython-0.29.37-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (3.1 kB) Collecting murmurhash<1.1.0,>=1.0.2 - Using cached murmurhash-1.0.10-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.0 kB) + Using cached murmurhash-1.0.11.tar.gz (13 kB) + Installing build dependencies: started + Installing build dependencies: finished with status 'done' + Getting requirements to build wheel: started + Getting requirements to build wheel: finished with status 'done' + Preparing metadata (pyproject.toml): started + Preparing metadata (pyproject.toml): finished with status 'done' Collecting cymem<2.1.0,>=2.0.2 - Using cached cymem-2.0.8-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB) + Using cached cymem-2.0.10.tar.gz (10 kB) + Installing build dependencies: started + Installing build dependencies: finished with status 'done' + Getting requirements to build wheel: started + Getting requirements to build wheel: finished with status 'done' + Preparing metadata (pyproject.toml): started + Preparing metadata (pyproject.toml): finished with status 'done' Collecting preshed<3.1.0,>=3.0.2 Using cached preshed-3.0.9-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.2 kB) Collecting blis<1.1.0,>=1.0.0 @@ -164,7 +188,7 @@ Prerequisites Using cached 
setuptools-75.3.0-py3-none-any.whl.metadata (6.9 kB) Collecting cython>=0.25 Using cached Cython-3.0.11-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB) - ERROR: Ignored the following versions that require a different python version: 1.25.0 Requires-Python >=3.9; 1.25.1 Requires-Python >=3.9; 1.25.2 Requires-Python >=3.9; 1.26.0 Requires-Python <3.13,>=3.9; 1.26.1 Requires-Python <3.13,>=3.9; 1.26.2 Requires-Python >=3.9; 1.26.3 Requires-Python >=3.9; 1.26.4 Requires-Python >=3.9; 2.0.0 Requires-Python >=3.9; 2.0.1 Requires-Python >=3.9; 2.0.2 Requires-Python >=3.9; 2.1.0 Requires-Python >=3.10; 2.1.0rc1 Requires-Python >=3.10; 2.1.1 Requires-Python >=3.10; 2.1.2 Requires-Python >=3.10; 2.1.3 Requires-Python >=3.10; 75.4.0 Requires-Python >=3.9; 75.5.0 Requires-Python >=3.9; 75.6.0 Requires-Python >=3.9 + ERROR: Ignored the following versions that require a different python version: 1.25.0 Requires-Python >=3.9; 1.25.1 Requires-Python >=3.9; 1.25.2 Requires-Python >=3.9; 1.26.0 Requires-Python <3.13,>=3.9; 1.26.1 Requires-Python <3.13,>=3.9; 1.26.2 Requires-Python >=3.9; 1.26.3 Requires-Python >=3.9; 1.26.4 Requires-Python >=3.9; 2.0.0 Requires-Python >=3.9; 2.0.1 Requires-Python >=3.9; 2.0.2 Requires-Python >=3.9; 2.1.0 Requires-Python >=3.10; 2.1.0rc1 Requires-Python >=3.10; 2.1.1 Requires-Python >=3.10; 2.1.2 Requires-Python >=3.10; 2.1.3 Requires-Python >=3.10; 2.2.0 Requires-Python >=3.10; 2.2.0rc1 Requires-Python >=3.10; 75.4.0 Requires-Python >=3.9; 75.5.0 Requires-Python >=3.9; 75.6.0 Requires-Python >=3.9 ERROR: Could not find a version that satisfies the requirement numpy<3.0.0,>=2.0.0 (from versions: 1.3.0, 1.4.1, 1.5.0, 1.5.1, 1.6.0, 1.6.1, 1.6.2, 1.7.0, 1.7.1, 1.7.2, 1.8.0, 1.8.1, 1.8.2, 1.9.0, 1.9.1, 1.9.2, 1.9.3, 1.10.0.post2, 1.10.1, 1.10.2, 1.10.4, 1.11.0, 1.11.1, 1.11.2, 1.11.3, 1.12.0, 1.12.1, 1.13.0, 1.13.1, 1.13.3, 1.14.0, 1.14.1, 1.14.2, 1.14.3, 1.14.4, 1.14.5, 1.14.6, 1.15.0, 1.15.1, 1.15.2, 1.15.3, 1.15.4, 1.16.0, 
1.16.1, 1.16.2, 1.16.3, 1.16.4, 1.16.5, 1.16.6, 1.17.0, 1.17.1, 1.17.2, 1.17.3, 1.17.4, 1.17.5, 1.18.0, 1.18.1, 1.18.2, 1.18.3, 1.18.4, 1.18.5, 1.19.0, 1.19.1, 1.19.2, 1.19.3, 1.19.4, 1.19.5, 1.20.0, 1.20.1, 1.20.2, 1.20.3, 1.21.0, 1.21.1, 1.21.2, 1.21.3, 1.21.4, 1.21.5, 1.21.6, 1.22.0, 1.22.1, 1.22.2, 1.22.3, 1.22.4, 1.23.0, 1.23.1, 1.23.2, 1.23.3, 1.23.4, 1.23.5, 1.24.0, 1.24.1, 1.24.2, 1.24.3, 1.24.4) ERROR: No matching distribution found for numpy<3.0.0,>=2.0.0 @@ -499,25 +523,25 @@ Prepare image gallery .. parsed-literal:: - data/red_panda.png: 0%| | 0.00/50.6k [00:00 1 or self.sliding_window is not None: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:164: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:164: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if past_key_values_length > 0: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/marian/modeling_marian.py:166: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. 
This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/marian/modeling_marian.py:166: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if ( - Exporting tokenizers to OpenVINO is not supported for tokenizers version > 0.19 and openvino version <= 2024.4. Please downgrade to tokenizers version <= 0.19 to export tokenizers to OpenVINO. + model.safetensors: 0%| | 0.00/312M [00:00 0.19 and openvino version <= 2024.4. Please downgrade to tokenizers version <= 0.19 to export tokenizers to OpenVINO. + model.safetensors: 100%|█████████████████████| 312M/312M [00:04<00:00, 71.1MB/s] .. code:: ipython3 @@ -888,10 +919,10 @@ support searching in Chinese. .. parsed-literal:: - 2024-11-22 01:36:43.187797: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 01:36:43.213112: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 02:26:01.092495: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 
+ 2024-12-10 02:26:01.118195: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/marian/tokenization_marian.py:175: UserWarning: Recommended: pip install sacremoses. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/marian/tokenization_marian.py:175: UserWarning: Recommended: pip install sacremoses. warnings.warn("Recommended: pip install sacremoses.") @@ -1123,13 +1154,13 @@ models can require different optimal threshold for search. .. parsed-literal:: - data/car-detection.mp4: 0%| | 0.00/2.68M [00:00`__ is a +“Model-as-a-Service” (MaaS) platform that seeks to bring together most +advanced machine learning models from the AI community, and to +streamline the process of leveraging AI models in real applications. +Hundreds of models are made publicly available on ModelScope (700+ and +counting), covering the latest development in areas such as NLP, CV, +Audio, Multi-modality, and AI for Science, etc. Many of these models +represent the SOTA in their specific fields, and made their open-sourced +debut on ModelScope. + +This tutorial covers how to use the modelscope ecosystem within +OpenVINO. + +Installation Instructions +~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is a self-contained example that relies solely on its own code. + +We recommend running the notebook in a virtual environment. You only +need a Jupyter server to start. For details, please refer to +`Installation +Guide `__. 
+ + +**Table of contents:** + + +- `Prerequisites <#prerequisites>`__ +- `Convert models from ModelScope using OpenVINO Model Conversion + API <#convert-models-from-modelscope-using-openvino-model-conversion-api>`__ + + - `Select inference device for image + classification <#select-inference-device-for-image-classification>`__ + - `Run Image classification <#run-image-classification>`__ + +- `Convert ModelScope models using Optimum + Intel <#convert-modelscope-models-using-optimum-intel>`__ + + - `Select inference device for text + classification <#select-inference-device-for-text-classification>`__ + - `Perform text classification <#perform-text-classification>`__ + +- `Convert ModelScope models for usage with OpenVINO + GenAI <#convert-modelscope-models-for-usage-with-openvino-genai>`__ + + - `Select inference device for text + generation <#select-inference-device-for-text-generation>`__ + - `Run OpenVINO GenAI pipeline <#run-openvino-genai-pipeline>`__ + +Prerequisites +------------- + + + +.. code:: ipython3 + + import platform + + %pip install -q "torch>=2.1.1" "torchvision" --extra-index-url https://download.pytorch.org/whl/cpu + %pip install -q modelscope addict oss2 simplejson sortedcontainers pillow opencv-python "datasets<=3.0.0" + %pip install -q "transformers>=4.45" "git+https://github.com/huggingface/optimum-intel.git" --extra-index-url https://download.pytorch.org/whl/cpu + %pip install -qU "openvino>=2024.5.0" "openvino-tokenizers>=2024.5.0" "openvino-genai>=2024.5.0" "nncf>=2.14.0" + + if platform.system() == "Darwin": + %pip install -q "numpy<2.0.0" + +.. 
code:: ipython3 + + import requests + from pathlib import Path + + if not Path("notebook_utils.py").exists(): + r = requests.get( + url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", + ) + open("notebook_utils.py", "w").write(r.text) + +Convert models from ModelScope using OpenVINO Model Conversion API +------------------------------------------------------------------ + + + +Modelscope package provides API for initializing a model and loading a +set of pre-trained weights using the model text handle. Discovering a +desired model name is straightforward with `Modelscope models web +page `__, one can choose a model +solving a particular machine learning problem and even sort the models +by popularity and novelty. + +OpenVINO supports various types of models and frameworks via conversion +to OpenVINO Intermediate Representation (IR). `OpenVINO model conversion +API `__ +should be used for these purposes. ``ov.convert_model`` function accepts +original model instance and example input for tracing and returns +``ov.Model`` representing this model in OpenVINO framework. Converted +model can be used for saving on disk using ``ov.save_model`` function or +directly loading on device using ``core.compile_model``. + +As example, we will use +`tinynas `__ +image classification model. The code below demonstrates how to load +this model using Modelscope pipelines interface, convert it to OpenVINO +IR and then perform image classification on specified device. + +..
code:: ipython3 + + from pathlib import Path + + from modelscope.pipelines import pipeline + from modelscope.utils.constant import Tasks + import openvino as ov + import torch + import gc + + + cls_model_id = "iic/cv_tinynas_classification" + cls_model_path = Path(cls_model_id.split("/")[-1]) / "openvino_model.xml" + + if not cls_model_path.exists(): + # load Modelcope pipeline with model + image_classification = pipeline(Tasks.image_classification, model=cls_model_id) + # convert model to OpenVINO + ov_model = ov.convert_model(image_classification.model, example_input=torch.zeros((1, 3, 224, 224)), input=[1, 3, 224, 224]) + # save OpenVINO model on disk for next usage + ov.save_model(ov_model, cls_model_path) + del ov_model + del image_classification + gc.collect(); + + +.. parsed-literal:: + + 2024-11-12 19:08:10.199148: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-11-12 19:08:10.212253: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered + WARNING: All log messages before absl::InitializeLog() is called are written to STDERR + E0000 00:00:1731424090.226654 1605757 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered + E0000 00:00:1731424090.230976 1605757 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered + 2024-11-12 19:08:10.246563: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. 
+ To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. + + +Select inference device for image classification +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +.. code:: ipython3 + + from notebook_utils import device_widget + + cv_cls_device = device_widget("CPU") + + cv_cls_device + + + + +.. parsed-literal:: + + Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU') + + + +Run Image classification +~~~~~~~~~~~~~~~~~~~~~~~~ + + + +Model inference interface remains compatible with pipeline preprocessing +and postprocessing, so you can reuse these parts of the pipeline, but for +providing standalone experience, we will demonstrate how to use model +without pipeline. The code below defines utilities for image +preprocessing and postprocessing. + +.. code:: ipython3 + + from notebook_utils import download_file + from PIL import Image + from torchvision import transforms + + # prepare input data and output labels + img_url = "https://pailitao-image-recog.oss-cn-zhangjiakou.aliyuncs.com/mufan/img_data/maas_test_data/dog.png" + img_path = Path("dog.png") + + labels_url = "https://raw.githubusercontent.com/openvinotoolkit/open_model_zoo/master/data/dataset_classes/imagenet_2012.txt" + + labels_path = Path("imagenet_2012.txt") + + if not img_path.exists(): + download_file(img_url) + + if not labels_path.exists(): + download_file(labels_url) + + image = Image.open(img_path) + imagenet_classes = labels_path.open("r").read().splitlines() + + + # prepare image preprocessing + transforms_normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + transform_list = [ + transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms_normalize, + ] + transformer = transforms.Compose(transform_list) + + # compile model + core = ov.Core() + + ov_model =
core.compile_model(cls_model_path, cv_cls_device.value) + +Now, when we make all necessary preparations, we can run model +inference. + +.. code:: ipython3 + + import numpy as np + + # preprocess input + image_tensor = transformer(image) + + # run model inference + result = ov_model(image_tensor.unsqueeze(0))[0] + + # postprocess results + label_id = np.argmax(result[0]) + score = result[0][label_id] + + label = imagenet_classes[label_id] + + # visualize results + display(image) + print(f"Predicted label: {label}, score {score}") + + + +.. image:: modelscope-to-openvino-with-output_files/modelscope-to-openvino-with-output_12_0.png + + +.. parsed-literal:: + + Predicted label: n02099601 golden retriever, score 8.060977935791016 + + +Convert ModelScope models using Optimum Intel +--------------------------------------------- + + + +For models compatible with the `HuggingFace +Transformers `__ +library, we can use `Optimum +Intel `__ integration +to convert and run model. Optimum Intel is the interface between the +Transformers and Diffusers libraries and the different tools and +libraries provided by Intel to accelerate end-to-end pipelines on Intel +architectures. + +Optimum Intel provides a simple interface for optimizing your +Transformers and Diffusers models, converting them to the OpenVINO +Intermediate Representation (IR) format, and running inference using +OpenVINO Runtime, among other use cases. For running ModelScope models +using this interface we should download model from hub first. There are +several ways how to download models from Modelscope Hub, one of them is +usage of ``modelscope.snapshot_download`` function. This function +accepts model id from hub and optionally local directory (if not +provided, model will be downloaded to cache directory). + +After that, we can load model to Optimum Intel interface replacing the +``AutoModelForXxx`` class from transformers with the corresponding +``OVModelForXxx``. 
Model conversion will be performed on the fly. To +avoid conversion next time, we can save the model on disk using the +``save_pretrained`` method and next time pass the directory with the +already converted model as an argument to the ``from_pretrained`` method. We +also specified ``device`` parameter for compiling the model on the +specific device, if not provided, the default device will be used. The +device can be changed later in runtime using ``model.to(device)``, +please note that it may require some time for model compilation on a +newly selected device. In some cases, it can be useful to separate model +initialization and compilation, for example, if you want to reshape the +model using ``reshape`` method, you can postpone compilation, providing +the parameter ``compile=False`` into ``from_pretrained`` method, +compilation can be performed manually using ``compile`` method or will +be performed automatically during first inference run. + +As an example, we will use +`nlp_bert_sentiment-analysis_english-base `__. +This model was trained to classify input text into 3 sentiment +categories: negative, positive and neutral. In transformers, +``AutoModelForSequenceClassification`` should be used for model +initialization, so to use the model with OpenVINO, it is enough to just +replace ``AutoModelForSequenceClassification`` with +``OVModelForSequenceClassification``. + +.. code:: ipython3 + + from modelscope import snapshot_download + + text_model_id = "iic/nlp_bert_sentiment-analysis_english-base" + text_model_path = Path(text_model_id.split("/")[-1]) + ov_text_model_path = text_model_path / "ov" + + + if not text_model_path.exists(): + snapshot_download(text_model_id, local_dir=text_model_path) + +Select inference device for text classification +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +.. code:: ipython3 + + from notebook_utils import device_widget + + text_cls_device = device_widget("CPU", "NPU") + + text_cls_device + + + + +.. 
parsed-literal:: + + Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU') + + + +Perform text classification +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +.. code:: ipython3 + + from transformers import AutoTokenizer + from optimum.intel.openvino import OVModelForSequenceClassification + + + tokenizer = AutoTokenizer.from_pretrained(text_model_path) + + if not ov_text_model_path.exists(): + # model will be automatically exported to OpenVINO format during loading + ov_model = OVModelForSequenceClassification.from_pretrained(text_model_path, text_cls_device.value) + ov_model.save_pretrained(ov_text_model_path) + # save converted model using save_pretrained to avoid conversion next time + tokenizer.save_pretrained(ov_text_model_path) + else: + # load converted model directly if available + ov_model = OVModelForSequenceClassification.from_pretrained(ov_text_model_path, device=text_cls_device.value) + + # prepare input + input_text = "Good night." + input_data = tokenizer(input_text, return_tensors="pt") + + # run model inference + output = ov_model(**input_data) + # postprocess results + predicted_label_id = output.logits[0].argmax().item() + + predicted_label = ov_model.config.id2label[predicted_label_id] + + print(f"predicted label: {predicted_label}") + + +.. parsed-literal:: + + predicted label: Positive + + +Convert ModelScope models for usage with OpenVINO GenAI +------------------------------------------------------- + + + +OpenVINO™ GenAI is a library of the most popular Generative AI model +pipelines, optimized execution methods, and samples that run on top of +highly performant `OpenVINO +Runtime `__. + +This library is friendly to PC and laptop execution, and optimized for +resource consumption. It requires no external dependencies to run +generative models as it already includes all the core functionality +(e.g. tokenization via openvino-tokenizers). 
+ +You can also load and run models from ModelScope with OpenVINO GenAI +`supported +pipelines `__. + +This inference approach is also based on model representation obtained +using Optimum Intel and also requires downloading the ModelScope model +first. As an example we will use the +`qwen2.5-1.5b-instruct `__ +model for text generation, which is part of the powerful Qwen2 LLM family. +While in the previous chapter we focused on using the Python API for +downloading and converting models, in this one we also consider +CLI usage for the same actions. + +Downloading ModelScope models using CLI can be performed using the following +command: + +.. code:: bash + + modelscope download <model_id> --local_dir <output_dir> + +where ``<model_id>`` is the model id from the Hub and ``<output_dir>`` is the +output directory for model saving. + +``optimum-cli`` provides command line interface for exporting models +using Optimum. General OpenVINO export command format: + +.. code:: bash + + optimum-cli export openvino --model <model_id_or_path> --task <task> <output_dir> + +where ``task`` is the task to export the model for. Available tasks depend on +the model, but are among: [‘default’, ‘fill-mask’, ‘text-generation’, +‘text2text-generation’, ‘text-classification’, ‘token-classification’, +‘multiple-choice’, ‘object-detection’, ‘question-answering’, +‘image-classification’, ‘image-segmentation’, ‘masked-im’, +‘semantic-segmentation’, ‘automatic-speech-recognition’, +‘audio-classification’, ‘audio-frame-classification’, +‘automatic-speech-recognition’, ‘audio-xvector’, ‘image-to-text’, +‘stable-diffusion’, ‘zero-shot-object-detection’]. + +You can find a mapping between tasks and model classes in Optimum +TaskManager +`documentation `__. + +Additionally, you can specify weights compression using +``--weight-format`` argument with one of following options: ``fp32``, +``fp16``, ``int8`` and ``int4``. For int8 and int4, nncf will be used for +weight compression. For models that require remote code execution, +``--trust-remote-code`` flag should be provided. 
+ +Full list of supported arguments available via ``--help`` + +.. code:: ipython3 + + from IPython.display import Markdown, display + + model_id = "Qwen/Qwen2.5-1.5B-Instruct" + + llm_path = Path("Qwen2.5-1.5B-Instruct") + ov_llm_path = llm_path / "ov" + download_command = f"modelscope download {model_id} --local_dir {llm_path}" + display(Markdown("**Download command:**")) + display(Markdown(f"`{download_command}`")) + + if not llm_path.exists(): + !{download_command} + + + +**Download command:** + + + +``modelscope download Qwen/Qwen2.5-1.5B-Instruct --local_dir Qwen2.5-1.5B-Instruct`` + + +.. code:: ipython3 + + export_command = f"optimum-cli export openvino -m {llm_path} --task text-generation-with-past --weight-format int4 {ov_llm_path}" + display(Markdown("**Export command:**")) + display(Markdown(f"`{export_command}`")) + + if not ov_llm_path.exists(): + !{export_command} + + + +**Export command:** + + + +``optimum-cli export openvino -m Qwen2.5-1.5B-Instruct --task text-generation-with-past --weight-format int4 Qwen2.5-1.5B-Instruct/ov`` + + +Select inference device for text generation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +.. code:: ipython3 + + from notebook_utils import device_widget + + llm_device = device_widget("CPU") + + llm_device + + + + +.. parsed-literal:: + + Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU') + + + +Run OpenVINO GenAI pipeline +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +For running text generation using OpenVINO GenAI, we should use +``LLMPipeline`` class initialized with providing converted model +directory and inference device. You can find more detailed example how +to use OpenVINO GenAI ``LLMPipeline`` for chatbot scenario in this +`tutorial `__. + +.. code:: ipython3 + + import openvino_genai as ov_genai + + + def streamer(subword): + print(subword, end="", flush=True) + # Return flag corresponds whether generation should be stopped. + # False means continue generation. 
+ return False + + + llm_pipe = ov_genai.LLMPipeline(ov_llm_path, llm_device.value) + + llm_pipe.generate("The Sun is yellow because", max_new_tokens=200, streamer=streamer) + + +.. parsed-literal:: + + it has a spectrum of colors, and you are also looking at it. What color would the sun be if you could see its light without being able to see any other objects? If we imagine that someone had never seen or heard about the sun before, what would they expect to see? + + 1. **Color of the Sun**: The sun appears yellow when viewed from Earth due to the way our atmosphere scatters sunlight. This phenomenon occurs as follows: + + - **Sunlight Scattering**: When sunlight passes through the Earth's atmosphere, different wavelengths (colors) of light travel at slightly different speeds due to their varying energies. + - **Air Mass Height**: At higher altitudes where air density decreases with altitude, shorter wavelength (blue) photons have more energy and thus escape faster into space compared to longer wavelength (red) photons which remain in the atmosphere longer. + - **Sky Color**: As a result, blue light is scattered more than red light by molecules in the upper layers of the atmosphere + + + +.. parsed-literal:: + + " it has a spectrum of colors, and you are also looking at it. What color would the sun be if you could see its light without being able to see any other objects? If we imagine that someone had never seen or heard about the sun before, what would they expect to see?\n\n1. **Color of the Sun**: The sun appears yellow when viewed from Earth due to the way our atmosphere scatters sunlight. 
This phenomenon occurs as follows:\n\n - **Sunlight Scattering**: When sunlight passes through the Earth's atmosphere, different wavelengths (colors) of light travel at slightly different speeds due to their varying energies.\n - **Air Mass Height**: At higher altitudes where air density decreases with altitude, shorter wavelength (blue) photons have more energy and thus escape faster into space compared to longer wavelength (red) photons which remain in the atmosphere longer.\n - **Sky Color**: As a result, blue light is scattered more than red light by molecules in the upper layers of the atmosphere" + + + +.. code:: ipython3 + + import gc + + del llm_pipe + gc.collect(); diff --git a/docs/notebooks/modelscope-to-openvino-with-output_files/modelscope-to-openvino-with-output_12_0.jpg b/docs/notebooks/modelscope-to-openvino-with-output_files/modelscope-to-openvino-with-output_12_0.jpg new file mode 100644 index 00000000000000..97ae56df8a8721 --- /dev/null +++ b/docs/notebooks/modelscope-to-openvino-with-output_files/modelscope-to-openvino-with-output_12_0.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1745fd9f64ac9914621f7eee3668e86daa8121bc83d1a2c7f27963c85026f104 +size 66633 diff --git a/docs/notebooks/modelscope-to-openvino-with-output_files/modelscope-to-openvino-with-output_12_0.png b/docs/notebooks/modelscope-to-openvino-with-output_files/modelscope-to-openvino-with-output_12_0.png new file mode 100644 index 00000000000000..d1c0d309736c1a --- /dev/null +++ b/docs/notebooks/modelscope-to-openvino-with-output_files/modelscope-to-openvino-with-output_12_0.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6235ab7dd2cb4318435320004320ffc6de773044c51cadcd581a7996faca313a +size 636558 diff --git a/docs/notebooks/music-generation-with-output.rst b/docs/notebooks/music-generation-with-output.rst index a5bdcbd8049318..2d63515872694f 100644 --- a/docs/notebooks/music-generation-with-output.rst +++ 
b/docs/notebooks/music-generation-with-output.rst @@ -124,8 +124,8 @@ Imports .. parsed-literal:: - 2024-11-22 01:43:50.913766: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 01:43:50.938403: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 02:28:39.145741: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 02:28:39.170431: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. @@ -165,7 +165,7 @@ generate a text-conditioned music sample. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/encodec/modeling_encodec.py:124: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor). 
+ /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/encodec/modeling_encodec.py:124: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor). self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False) Config of the text_encoder: is overwritten by shared text_encoder config: T5Config { "_name_or_path": "t5-base", @@ -346,7 +346,7 @@ vocabulary. It helps the model understand the context of a sentence. @@ -431,7 +431,7 @@ runtime .. parsed-literal:: [ WARNING ] Please fix your imports. Module %s has been moved to %s. The old module will be deleted in version %s. - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead warnings.warn( `loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`. @@ -775,7 +775,7 @@ We can now infer the pipeline backed by OpenVINO models. 
diff --git a/docs/notebooks/nano-llava-multimodal-chatbot-with-output.rst b/docs/notebooks/nano-llava-multimodal-chatbot-with-output.rst index 0bac7af3f39c32..9cefe7216f2076 100644 --- a/docs/notebooks/nano-llava-multimodal-chatbot-with-output.rst +++ b/docs/notebooks/nano-llava-multimodal-chatbot-with-output.rst @@ -204,8 +204,8 @@ documentation 1 or self.sliding_window is not None) and self.is_causal: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/optimum/exporters/onnx/model_patcher.py:306: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/optimum/exporters/onnx/model_patcher.py:306: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if past_key_values_length > 0: /opt/home/k8sworker/.cache/huggingface/modules/transformers_modules/qnguyen3/nanoLLaVA/13d60cec183a86755afed64da495fcc2c382ea80/modeling_llava_qwen2.py:939: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
if seq_len > self.max_seq_len_cached: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/cache_utils.py:443: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/cache_utils.py:443: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results. elif len(self.key_cache[layer_idx]) == 0: # fills previously skipped layers; checking for tensor causes errors /opt/home/k8sworker/.cache/huggingface/modules/transformers_modules/qnguyen3/nanoLLaVA/13d60cec183a86755afed64da495fcc2c382ea80/modeling_llava_qwen2.py:1499: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): @@ -530,10 +530,10 @@ image encoder model. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/nncf/quantization/quantize_model.py:432: FutureWarning: `CompressWeightsMode.INT8` is deprecated. Please, use `CompressWeightsMode.INT8_ASYM` as value instead. 
+ /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/nncf/quantization/quantize_model.py:432: FutureWarning: `CompressWeightsMode.INT8` is deprecated. Please, use `CompressWeightsMode.INT8_ASYM` as value instead. warning_deprecated( - 2024-11-22 01:48:49.764790: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 01:48:49.789684: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 02:33:42.983675: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 02:33:43.008813: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. @@ -742,8 +742,7 @@ can use the same tokenizer and image processor that provided with model. Question: Describe this image in detail Answer: - This image features a cute, white lama, possibly a llama, which is depicted in a playful pose. The llama is surrounded by a fire, indicating it's being set on a burner. The flame appears to be a bright, bright yellow, and there are several tiny flames, possibly from the llama's actions. - The llama itself is quite detailed. It has a small brown nose and dark eyes that are expressive. 
The face of the llama is quite detailed as well, with a pair of ears that are also light brown. The llama's mouth is open, revealing its pink lips. There are also small pink spots on its face, + The image features a white, fluffy lamb with a big, bright smile, standing next to a fire. The lamb's face is detailed, with black eyes that are slightly squinty, and a mouth that's slightly open. It seems to be enjoying the heat from the fire, as it is seen looking down. The lamb's legs are also visible, and they appear to be furry. The lamb's tail is long and appears to be fluffy as well. The lamb's ears are also visible and are pink. The lamb's face is depicted in detail, with small black eyes and black nostrils. The lamb's nose is also Interactive demo diff --git a/docs/notebooks/notebooks_with_binder_buttons.txt b/docs/notebooks/notebooks_with_binder_buttons.txt index ce9cb50da47907..58f31aaae508c8 100644 --- a/docs/notebooks/notebooks_with_binder_buttons.txt +++ b/docs/notebooks/notebooks_with_binder_buttons.txt @@ -7,7 +7,6 @@ convert-to-openvino cross-lingual-books-alignment depth-anything detectron2-to-openvino -distilbert-sequence-classification fast-segment-anything handwritten-ocr hello-detection diff --git a/docs/notebooks/notebooks_with_colab_buttons.txt b/docs/notebooks/notebooks_with_colab_buttons.txt index 59b3348a4c90f7..2361fbe9a19c69 100644 --- a/docs/notebooks/notebooks_with_colab_buttons.txt +++ b/docs/notebooks/notebooks_with_colab_buttons.txt @@ -1,5 +1,4 @@ 3D-segmentation-point-clouds -amused-lightweight-text-to-image async-api auto-device clip-language-saliency-map @@ -8,7 +7,6 @@ cross-lingual-books-alignment depth-anything depth-anything-v2 detectron2-to-openvino -distilbert-sequence-classification explainable-ai-1-basic explainable-ai-2-deep-dive explainable-ai-3-map-interpretation diff --git a/docs/notebooks/object-detection-with-output.rst b/docs/notebooks/object-detection-with-output.rst index 5debc4e7ed88d4..fc055f6e7ae63e 100644 --- 
a/docs/notebooks/object-detection-with-output.rst +++ b/docs/notebooks/object-detection-with-output.rst @@ -84,7 +84,7 @@ Install requirements .. parsed-literal:: - 24717 + 24624 @@ -136,21 +136,21 @@ Download and convert the Model .. parsed-literal:: - 100%|██████████| 6.25M/6.25M [00:00<00:00, 26.9MB/s] + 100%|██████████| 6.25M/6.25M [00:00<00:00, 26.8MB/s] .. parsed-literal:: - Ultralytics 8.3.0 🚀 Python-3.8.10 torch-2.2.2+cpu CPU (Intel Core(TM) i9-10920X 3.50GHz) + Ultralytics 8.3.0 🚀 Python-3.8.10 torch-2.4.1+cpu CPU (Intel Core(TM) i9-10920X 3.50GHz) YOLOv8n summary (fused): 168 layers, 3,151,904 parameters, 0 gradients, 8.7 GFLOPs PyTorch: starting from 'yolov8n.pt' with input shape (1, 3, 640, 640) BCHW and output shape(s) (1, 84, 8400) (6.2 MB) OpenVINO: starting export with openvino 2024.4.0-16579-c3152d32c9c-releases/2024/4... - OpenVINO: export success ✅ 1.4s, saved as 'yolov8n_openvino_model/' (6.4 MB) + OpenVINO: export success ✅ 1.3s, saved as 'yolov8n_openvino_model/' (6.4 MB) - Export complete (1.6s) - Results saved to /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/object-detection-webcam + Export complete (1.5s) + Results saved to /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/object-detection-webcam Predict: yolo predict task=detect model=yolov8n_openvino_model imgsz=640 half Validate: yolo val task=detect model=yolov8n_openvino_model imgsz=640 data=coco.yaml half Visualize: https://netron.app @@ -222,7 +222,7 @@ best performance. For that purpose, just use ``AUTO``. .. parsed-literal:: - Ultralytics 8.3.0 🚀 Python-3.8.10 torch-2.2.2+cpu CPU (Intel Core(TM) i9-10920X 3.50GHz) + Ultralytics 8.3.0 🚀 Python-3.8.10 torch-2.4.1+cpu CPU (Intel Core(TM) i9-10920X 3.50GHz) Loading yolov8n_openvino_model for OpenVINO inference... Using OpenVINO LATENCY mode for batch=1 inference... 
diff --git a/docs/notebooks/omniparser-with-output.rst b/docs/notebooks/omniparser-with-output.rst index 28676a03a84ba7..e22ce49105f78d 100644 --- a/docs/notebooks/omniparser-with-output.rst +++ b/docs/notebooks/omniparser-with-output.rst @@ -20,7 +20,6 @@ repo `__ and `model card `__. In this tutorial we consider how to run OmniParser using OpenVINO. - **Table of contents:** - `Prerequisites <#prerequisites>`__ @@ -72,9 +71,14 @@ Prerequisites .. code:: ipython3 - %pip install -q "torch>=2.1" easyocr torchvision accelerate "supervision==0.18.0" accelerate timm "einops==0.8.0" "ultralytics==8.1.24" pillow opencv-python "gradio>=4.19" --extra-index-url https://download.pytorch.org/whl/cpu + import platform + + %pip install -q "torch>=2.1" easyocr torchvision accelerate "supervision==0.18.0" "transformers>=4.45" timm "einops==0.8.0" "ultralytics==8.1.24" pillow opencv-python "gradio>=4.19" --extra-index-url https://download.pytorch.org/whl/cpu %pip install -q "openvino>=2024.4.0" + if platform.system() == "Darwin": + %pip install -q "numpy<2.0" + .. 
parsed-literal:: @@ -89,16 +93,21 @@ Prerequisites notebook_utils_path = Path("notebook_utils.py") florence_helper_path = Path("ov_florence2_helper.py") + omniparser_helper_path = Path("ov_omniparser_helper.py") if not notebook_utils_path.exists(): r = requests.get( url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", ) - notebook_utils_path.open("w").write(r.text) + notebook_utils_path.open("w", encoding="utf-8").write(r.text) if not florence_helper_path.exists(): r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/florence2/ov_florence2_helper.py") - florence_helper_path.open("w").write(r.text) + florence_helper_path.open("w", encoding="utf-8").write(r.text) + + if not omniparser_helper_path.exists(): + r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/omniparser/ov_omniparser_helper.py") + omniparser_helper_path.open("w", encoding="utf-8").write(r.text) Prepare models -------------- @@ -155,21 +164,21 @@ API. You can find more examples of this API usage in these .. parsed-literal:: - 2024-11-22 01:51:07.385705: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 01:51:07.410345: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 02:35:42.631431: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 
+ 2024-12-10 02:35:42.657651: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. .. parsed-literal:: - weights/icon_detect/best.pt: 0%| | 0.00/11.7M [00:00=1.4.0, which is not installed. - mobileclip 0.1.0 requires torchvision==0.14.1, but you have torchvision 0.17.2+cpu which is incompatible. tensorflow 2.12.0 requires keras<2.13,>=2.12.0, but you have keras 2.13.1 which is incompatible. tensorflow 2.12.0 requires numpy<1.24,>=1.22, but you have numpy 1.24.4 which is incompatible. tensorflow 2.12.0 requires tensorboard<2.13,>=2.12, but you have tensorboard 2.13.0 which is incompatible. tensorflow 2.12.0 requires tensorflow-estimator<2.13,>=2.12.0, but you have tensorflow-estimator 2.13.0 which is incompatible. tensorflow-cpu 2.13.1 requires numpy<=1.24.3,>=1.22, but you have numpy 1.24.4 which is incompatible. tensorflow-cpu 2.13.1 requires typing-extensions<4.6.0,>=3.6.6, but you have typing-extensions 4.12.2 which is incompatible. - torchvision 0.17.2+cpu requires torch==2.2.2, but you have torch 2.4.1 which is incompatible. Note: you may need to restart the kernel to use updated packages. @@ -250,9 +247,9 @@ True .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/nn/utils/weight_norm.py:134: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`. 
+ /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/nn/utils/weight_norm.py:134: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`. WeightNorm.apply(module, name, dim) - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/api.py:36: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/api.py:36: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. 
This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. checkpoint_dict = torch.load(ckpt_path, map_location=torch.device(self.device)) @@ -266,9 +263,9 @@ True .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/wavmark/__init__.py:16: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/wavmark/__init__.py:16: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. 
It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. checkpoint = torch.load(resume_path, map_location=torch.device('cpu')) - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/api.py:36: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. 
+ /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/api.py:36: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. checkpoint_dict = torch.load(ckpt_path, map_location=torch.device(self.device)) @@ -418,38 +415,40 @@ documentation 0 - No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda' - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/attentions.py:283: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/attentions.py:283: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. 
We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! assert ( - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/attentions.py:346: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/attentions.py:346: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! pad_length = max(length - (self.window_size + 1), 0) - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/attentions.py:347: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/attentions.py:347: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. 
This means that the trace might not generalize to other inputs! slice_start_position = max((self.window_size + 1) - length, 0) - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/attentions.py:349: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/attentions.py:349: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if pad_length > 0: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/transforms.py:114: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/transforms.py:114: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
if torch.min(inputs) < left or torch.max(inputs) > right: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/transforms.py:119: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/transforms.py:119: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if min_bin_width * num_bins > 1.0: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/transforms.py:121: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/transforms.py:121: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
if min_bin_height * num_bins > 1.0: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/transforms.py:171: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/openvoice/OpenVoice/openvoice/transforms.py:171: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! assert (discriminant >= 0).all() - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/jit/_trace.py:1303: TracerWarning: Trace had nondeterministic nodes. Did you forget call .eval() on your model? 
Nodes: - %3293 : Float(1, 2, 43, strides=[86, 43, 1], requires_grad=0, device=cpu) = aten::randn(%3288, %3289, %3290, %3291, %3292) # /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py:86:0 - %5559 : Float(1, 192, 153, strides=[29376, 1, 192], requires_grad=0, device=cpu) = aten::randn_like(%m_p, %5554, %5555, %5556, %5557, %5558) # /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py:86:0 + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/jit/_trace.py:1303: TracerWarning: Trace had nondeterministic nodes. Did you forget call .eval() on your model? Nodes: + %3293 : Float(1, 2, 43, strides=[86, 43, 1], requires_grad=0, device=cpu) = aten::randn(%3288, %3289, %3290, %3291, %3292) # /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py:86:0 + %5559 : Float(1, 192, 150, strides=[28800, 1, 192], requires_grad=0, device=cpu) = aten::randn_like(%m_p, %5554, %5555, %5556, %5557, %5558) # /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py:86:0 This may cause errors in trace checking. To disable trace checking, pass check_trace=False to torch.jit.trace() _check_trace( - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/jit/_trace.py:1303: TracerWarning: Output nr 1. 
of the traced function does not match the corresponding output of the Python function. Detailed error: - The values for attribute 'shape' do not match: torch.Size([1, 1, 39680]) != torch.Size([1, 1, 38400]). - _check_trace( - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/jit/_trace.py:1303: TracerWarning: Output nr 2. of the traced function does not match the corresponding output of the Python function. Detailed error: - The values for attribute 'shape' do not match: torch.Size([1, 1, 155, 43]) != torch.Size([1, 1, 150, 43]). + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/jit/_trace.py:1303: TracerWarning: Output nr 1. of the traced function does not match the corresponding output of the Python function. Detailed error: + Tensor-likes are not close! + + Mismatched elements: 38094 / 39424 (96.6%) + Greatest absolute difference: 0.7026380896568298 at index (0, 0, 4174) (up to 1e-05 allowed) + Greatest relative difference: 43899.56701030928 at index (0, 0, 2529) (up to 1e-05 allowed) _check_trace( - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/jit/_trace.py:1303: TracerWarning: Output nr 3. of the traced function does not match the corresponding output of the Python function. Detailed error: - The values for attribute 'shape' do not match: torch.Size([1, 1, 155]) != torch.Size([1, 1, 150]). + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/jit/_trace.py:1303: TracerWarning: Output nr 2. of the traced function does not match the corresponding output of the Python function. Detailed error: + Tensor-likes are not close! 
+ + Mismatched elements: 42 / 6622 (0.6%) + Greatest absolute difference: 1.0 at index (0, 0, 7, 1) (up to 1e-05 allowed) + Greatest relative difference: inf at index (0, 0, 7, 2) (up to 1e-05 allowed) _check_trace( @@ -483,16 +482,16 @@ documentation )`. (Triggered internally at ../aten/src/ATen/native/SpectralOps.cpp:836.) + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/nn/modules/module.py:1562: UserWarning: A window was not provided. A rectangular window will be applied,which is known to cause spectral leakage. Other windows such as torch.hann_window or torch.hamming_window can are recommended to reduce spectral leakage.To suppress this warning and use a rectangular window, explicitly set `window=torch.ones(n_fft, device=)`. (Triggered internally at ../aten/src/ATen/native/SpectralOps.cpp:836.) return forward_call(\*args, \*\*kwargs) @@ -720,7 +719,7 @@ Load speaker embeddings .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/functional.py:666: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/functional.py:666: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error. Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at ../aten/src/ATen/native/SpectralOps.cpp:873.) 
return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined] @@ -875,7 +874,7 @@ And finally, run voice tone conversion with OpenVINO optimized model @@ -893,7 +892,7 @@ And finally, run voice tone conversion with OpenVINO optimized model @@ -1082,7 +1081,7 @@ voice tone conversion online. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/gradio/components/dropdown.py:100: UserWarning: The `max_choices` parameter is ignored when `multiselect` is False. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/gradio/components/dropdown.py:100: UserWarning: The `max_choices` parameter is ignored when `multiselect` is False. warnings.warn( diff --git a/docs/notebooks/optical-character-recognition-with-output.rst b/docs/notebooks/optical-character-recognition-with-output.rst index 7dae2290312e68..764bad414c61e9 100644 --- a/docs/notebooks/optical-character-recognition-with-output.rst +++ b/docs/notebooks/optical-character-recognition-with-output.rst @@ -131,13 +131,13 @@ again. .. parsed-literal:: - model/horizontal-text-detection-0001/FP16/horizontal-text-detection-0001.bin: 0%| | 0.00/3.70M [00:… + horizontal-text-detection-0001.bin: 0%| | 0.00/3.70M [00:00 + @@ -375,7 +375,7 @@ may be specified is input data .. parsed-literal:: - + @@ -413,7 +413,7 @@ then such conversion will be added explicitly. .. parsed-literal:: - + @@ -575,7 +575,7 @@ Compare results on one image .. parsed-literal:: - data/imagenet_2012.txt: 0%| | 0.00/30.9k [00:00= 3.10. 
Please make + sure that your environment fulfill to this requirement before running + it + +`OuteTTS-0.1-350M `__ is +a novel text-to-speech synthesis model that leverages pure language +modeling without external adapters or complex architectures, built upon +the LLaMa architecture. It demonstrates that high-quality speech +synthesis is achievable through a straightforward approach using crafted +prompts and audio tokens. + +More details about model can be found in `original +repo `__. + +In this tutorial we consider how to run OuteTTS pipeline using OpenVINO. + + +**Table of contents:** + + +- `Prerequisites <#prerequisites>`__ +- `Convert model <#convert-model>`__ +- `Run model inference <#run-model-inference>`__ + + - `Text-to-Speech generation <#text-to-speech-generation>`__ + - `Text-to-Speech generation with Voice + Cloning <#text-to-speech-generation-with-voice-cloning>`__ + +- `Interactive demo <#interactive-demo>`__ + +Installation Instructions +~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is a self-contained example that relies solely on its own code. + +We recommend running the notebook in a virtual environment. You only +need a Jupyter server to start. For details, please refer to +`Installation +Guide `__. + +Prerequisites +------------- + + + +.. code:: ipython3 + + import platform + + %pip install -q "torch>=2.1" "torchaudio" "einops" "transformers>=4.46.1" "loguru" "inflect" "pesq" "torchcrepe" "natsort" "polars" uroman mecab-python3 unidic-lite --extra-index-url https://download.pytorch.org/whl/cpu + %pip install -q "gradio>=4.19" "openvino>=2024.4.0" "tqdm" "pyyaml" "librosa" "soundfile" + %pip install -q "git+https://github.com/huggingface/optimum-intel.git" --extra-index-url https://download.pytorch.org/whl/cpu + + if platform.system() == "Darwin": + %pip install -q "numpy<2.0.0" + +.. 
code:: ipython3 + + import requests + from pathlib import Path + + utility_files = ["cmd_helper.py", "notebook_utils.py"] + base_utility_url = "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/" + + for utility_file in utility_files: + if not Path(utility_file).exists(): + r = requests.get(base_utility_url + utility_file) + with Path(utility_file).open("w") as f: + f.write(r.text) + + + helper_files = ["gradio_helper.py", "ov_outetts_helper.py"] + base_helper_url = "https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/outetts-text-to-speech" + + for helper_file in helper_files: + if not Path(helper_file).exists(): + r = requests.get(base_helper_url + helper_file) + with Path(helper_file).open("w") as f: + f.write(r.text) + +.. code:: ipython3 + + from cmd_helper import clone_repo + + repo_path = clone_repo("https://github.com/edwko/OuteTTS.git") + + interface_path = repo_path / "outetts/version/v1/interface.py" + + updated_version = interface_path.exists() + + if not updated_version: + interface_pth = repo_path / "outetts/v0_1/interface.py" + orig_interface_path = interface_path.parent / "_orig_interface.py" + + if not updated_version and not orig_interface_path.exists(): + interface_path.rename(orig_interface_path) + # sounddevice requires to install manually additional libraries, as we do not plan to use it for audio playing + # move it closer to its usage for avoid errors + with orig_interface_path.open("r") as in_file: + content = in_file.read() + upd_content = content.replace("import sounddevice as sd", "") + upd_content = upd_content.replace("sd.play", "import sounddevice as sd\n sd.play") + with interface_path.open("w") as out_file: + out_file.write(upd_content) + + %pip install -q {repo_path} --extra-index-url https://download.pytorch.org/whl/cpu + +Convert model +------------- + + + +OpenVINO supports PyTorch models via conversion to OpenVINO Intermediate +Representation format. 
For convenience, we will use OpenVINO integration +with HuggingFace Optimum. `Optimum +Intel `__ is the +interface between the Transformers and Diffusers libraries and the +different tools and libraries provided by Intel to accelerate end-to-end +pipelines on Intel architectures. + +Among other use cases, Optimum Intel provides a simple interface to +optimize your Transformers and Diffusers models, convert them to the +OpenVINO Intermediate Representation (IR) format and run inference using +OpenVINO Runtime. ``optimum-cli`` provides command line interface for +model conversion and optimization. + +General command format: + +.. code:: bash + + optimum-cli export openvino --model --task + +where task is task to export the model for, if not specified, the task +will be auto-inferred based on the model. You can find a mapping between +tasks and model classes in Optimum TaskManager +`documentation `__. +Additionally, you can specify weights compression using +``--weight-format`` argument with one of following options: ``fp32``, +``fp16``, ``int8`` and ``int4``. Fro int8 and int4 +`nncf `__ will be used for +weight compression. More details about model export provided in `Optimum +Intel +documentation `__. + +As OuteTTS utilizes pure language modeling approach, model conversion +process remains the same like conversion LLaMa models family for text +generation purposes. + +.. code:: ipython3 + + from cmd_helper import optimum_cli + + model_id = "OuteAI/OuteTTS-0.1-350M" + model_dir = Path(model_id.split("/")[-1] + "-ov") + + if not model_dir.exists(): + optimum_cli(model_id, model_dir, additional_args={"task": "text-generation-with-past"}) + +Run model inference +------------------- + + + +OpenVINO integration with Optimum Intel provides ready-to-use API for +model inference that can be used for smooth integration with +transformers-based solutions. 
For loading model, we will use +``OVModelForCausalLM`` class that have compatible interface with +Transformers LLaMa implementation. For loading a model, +``from_pretrained`` method should be used. It accepts path to the model +directory or model_id from HuggingFace hub (if model is not converted to +OpenVINO format, conversion will be triggered automatically). +Additionally, we can provide an inference device, quantization config +(if model has not been quantized yet) and device-specific OpenVINO +Runtime configuration. More details about model inference with Optimum +Intel can be found in +`documentation `__. +We will use ``OVModelForCausalLM`` as replacement of original +``AutoModelForCausalLM`` in ``InterfaceHF``. + +.. code:: ipython3 + + from notebook_utils import device_widget + + device = device_widget(exclude=["NPU"]) + + device + + + + +.. parsed-literal:: + + Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') + + + +.. code:: ipython3 + + from ov_outetts_helper import InterfaceOV, OVHFModel # noqa: F401 + + # Uncomment these lines to see pipeline details + # ??InterfaceOV + # ??OVHFModel + + +.. parsed-literal:: + + 2024-11-29 11:48:51.975233: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 
+ 2024-11-29 11:48:51.989550: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered + WARNING: All log messages before absl::InitializeLog() is called are written to STDERR + E0000 00:00:1732866532.005718 2314480 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered + E0000 00:00:1732866532.010517 2314480 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered + 2024-11-29 11:48:52.027376: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. + + +.. code:: ipython3 + + interface = InterfaceOV(model_dir, device.value) + + +.. parsed-literal:: + + making attention of type 'vanilla' with 768 in_channels + + +Text-to-Speech generation +~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +Now let’s see model in action. Providing input text to ``generate`` +method of interface, model returns tensor that represents output audio +with random speaker characteristics. + +.. code:: ipython3 + + output = interface.generate(text="Hello, I'm working!", temperature=0.1, repetition_penalty=1.1, max_length=4096) + + +.. parsed-literal:: + + The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results. + Setting `pad_token_id` to `eos_token_id`:None for open-end generation. + The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. 
Please pass your input's `attention_mask` to obtain reliable results. + + +.. code:: ipython3 + + import IPython.display as ipd + + ipd.Audio(output.audio[0].numpy(), rate=output.sr) + + + + +.. raw:: html + + + + + + + +Text-to-Speech generation with Voice Cloning +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +Additionally, we can specify reference voice for generation by providing +reference audio and transcript for it. ``interface.create_speaker`` +processes reference audio and text to set of features used for audio +description. + +.. code:: ipython3 + + from notebook_utils import download_file + + ref_audio_url = "https://huggingface.co/OuteAI/OuteTTS-0.1-350M/resolve/main/samples/2.wav" + + file_path = download_file(ref_audio_url) + + +.. parsed-literal:: + + '2.wav' already exists. + + +.. code:: ipython3 + + ipd.Audio(file_path) + + + + +.. raw:: html + + + + + + + +.. code:: ipython3 + + speaker = interface.create_speaker(file_path, "Hello, I can speak pretty well, but sometimes I make some mistakes.") + + # Save the speaker to a file + interface.save_speaker(speaker, "speaker.pkl") + + # Load the speaker from a file + speaker = interface.load_speaker("speaker.pkl") + + # Generate TTS with the custom voice + output = interface.generate(text="This is a cloned voice speaking", speaker=speaker, temperature=0.1, repetition_penalty=1.1, max_length=4096) + + +.. parsed-literal:: + + The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results. + Setting `pad_token_id` to `eos_token_id`:None for open-end generation. + + +.. code:: ipython3 + + ipd.Audio(output.audio[0].numpy(), rate=output.sr) + + + + +.. raw:: html + + + + + + + +Interactive demo +---------------- + + + +.. 
code:: ipython3 + + from gradio_helper import make_demo + + demo = make_demo(interface) + + try: + demo.launch(debug=True) + except Exception: + demo.launch(share=True, debug=True) diff --git a/docs/notebooks/paddle-ocr-webcam-with-output.rst b/docs/notebooks/paddle-ocr-webcam-with-output.rst index 3fae2e47d99b24..aa054a40e73a07 100644 --- a/docs/notebooks/paddle-ocr-webcam-with-output.rst +++ b/docs/notebooks/paddle-ocr-webcam-with-output.rst @@ -214,7 +214,7 @@ Download the Model for Text **Detection** .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-no… + ch_PP-OCRv3_det_infer.tar: 0%| | 0.00/3.65M [00:00 + @@ -439,7 +439,7 @@ Note that many optimizations are possible to improve the performance. .. parsed-literal:: - PaddlePaddle model on CPU: 0.0069 seconds per image, FPS: 144.32 + PaddlePaddle model on CPU: 0.0071 seconds per image, FPS: 141.67 PaddlePaddle result: Labrador retriever, 0.75138 @@ -500,7 +500,7 @@ select device from dropdown list for running inference using OpenVINO .. parsed-literal:: - OpenVINO IR model in OpenVINO Runtime (AUTO): 0.0026 seconds per image, FPS: 380.57 + OpenVINO IR model in OpenVINO Runtime (AUTO): 0.0027 seconds per image, FPS: 376.00 OpenVINO result: Labrador retriever, 0.74909 diff --git a/docs/notebooks/parler-tts-text-to-speech-with-output.rst b/docs/notebooks/parler-tts-text-to-speech-with-output.rst index 323959aa17e8ef..2be3c2a4a2c7ed 100644 --- a/docs/notebooks/parler-tts-text-to-speech-with-output.rst +++ b/docs/notebooks/parler-tts-text-to-speech-with-output.rst @@ -9,7 +9,7 @@ with synthetic annotations `__ by Dan Lyth and Simon King, from Stability AI and Edinburgh University respectively. -.. 
image:: https://images.squarespace-cdn.com/content/v1/657816dfbefe0533e8a69d9a/30c96e25-acc5-4019-acdd-648da6142c4c/architecture_v3.png?format=2500w +|image0| Text-to-speech models trained on large-scale datasets have demonstrated impressive in-context learning capabilities and naturalness. However, @@ -53,6 +53,8 @@ need a Jupyter server to start. For details, please refer to `Installation Guide `__. +.. |image0| image:: https://images.squarespace-cdn.com/content/v1/657816dfbefe0533e8a69d9a/30c96e25-acc5-4019-acdd-648da6142c4c/architecture_v3.png?format=2500w + Prerequisites ------------- @@ -64,8 +66,32 @@ Prerequisites os.environ["GIT_CLONE_PROTECTION_ACTIVE"] = "false" + %pip uninstall -q -y torch torchvision torchaudio %pip install -q "openvino>=2024.2.0" - %pip install -q git+https://github.com/huggingface/parler-tts.git "gradio>=4.19" transformers "torch>=2.2" --extra-index-url https://download.pytorch.org/whl/cpu + %pip install -q git+https://github.com/huggingface/parler-tts.git "gradio>=4.19" transformers "torch>=2.2" "torchaudio" --extra-index-url https://download.pytorch.org/whl/cpu + + +.. parsed-literal:: + + Note: you may need to restart the kernel to use updated packages. + Note: you may need to restart the kernel to use updated packages. + ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts. + easyocr 1.7.2 requires torchvision>=0.5, which is not installed. + mobileclip 0.1.0 requires clip-benchmark>=1.4.0, which is not installed. + mobileclip 0.1.0 requires torchvision==0.14.1, which is not installed. + open-clip-torch 2.22.0 requires torchvision, which is not installed. + timm 1.0.12 requires torchvision, which is not installed. + ultralytics 8.1.24 requires torchvision>=0.9.0, which is not installed. + open-clip-torch 2.22.0 requires protobuf<4, but you have protobuf 4.25.5 which is incompatible. 
+ tensorflow 2.12.0 requires keras<2.13,>=2.12.0, but you have keras 2.13.1 which is incompatible. + tensorflow 2.12.0 requires numpy<1.24,>=1.22, but you have numpy 1.24.4 which is incompatible. + tensorflow 2.12.0 requires tensorboard<2.13,>=2.12, but you have tensorboard 2.13.0 which is incompatible. + tensorflow 2.12.0 requires tensorflow-estimator<2.13,>=2.12.0, but you have tensorflow-estimator 2.13.0 which is incompatible. + tensorflow-cpu 2.13.1 requires numpy<=1.24.3,>=1.22, but you have numpy 1.24.4 which is incompatible. + tensorflow-cpu 2.13.1 requires typing-extensions<4.6.0,>=3.6.6, but you have typing-extensions 4.12.2 which is incompatible. + tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 4.25.5 which is incompatible. + Note: you may need to restart the kernel to use updated packages. + Load the original model and inference ------------------------------------- @@ -95,6 +121,135 @@ Load the original model and inference audio_arr = generation.cpu().numpy().squeeze() sf.write("parler_tts_out.wav", audio_arr, model.config.sampling_rate) + +.. parsed-literal:: + + 2024-12-10 02:43:30.030324: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 02:43:30.055592: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. 
+ Flash attention 2 is not installed + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/nn/utils/weight_norm.py:134: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`. + WeightNorm.apply(module, name, dim) + Config of the text_encoder: is overwritten by shared text_encoder config: T5Config { + "_name_or_path": "google/flan-t5-base", + "architectures": [ + "T5ForConditionalGeneration" + ], + "classifier_dropout": 0.0, + "d_ff": 2048, + "d_kv": 64, + "d_model": 768, + "decoder_start_token_id": 0, + "dense_act_fn": "gelu_new", + "dropout_rate": 0.1, + "eos_token_id": 1, + "feed_forward_proj": "gated-gelu", + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": true, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "n_positions": 512, + "num_decoder_layers": 12, + "num_heads": 12, + "num_layers": 12, + "output_past": true, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "task_specific_params": { + "summarization": { + "early_stopping": true, + "length_penalty": 2.0, + "max_length": 200, + "min_length": 30, + "no_repeat_ngram_size": 3, + "num_beams": 4, + "prefix": "summarize: " + }, + "translation_en_to_de": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to German: " + }, + "translation_en_to_fr": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to French: " + }, + "translation_en_to_ro": { + "early_stopping": true, + "max_length": 300, + "num_beams": 4, + "prefix": "translate English to Romanian: " + } + }, + "tie_word_embeddings": false, + "transformers_version": "4.46.1", + "use_cache": true, + "vocab_size": 32128 + } + + Config of the audio_encoder: is overwritten by shared audio_encoder config: DACConfig { + 
"_name_or_path": "ylacombe/dac_44khZ_8kbps", + "architectures": [ + "DACModel" + ], + "codebook_size": 1024, + "frame_rate": 86, + "latent_dim": 1024, + "model_bitrate": 8, + "model_type": "dac_on_the_hub", + "num_codebooks": 9, + "sampling_rate": 44100, + "torch_dtype": "float32", + "transformers_version": "4.46.1" + } + + Config of the decoder: is overwritten by shared decoder config: ParlerTTSDecoderConfig { + "_name_or_path": "/fsx/yoach/tmp/artefacts/decoder_400M/", + "activation_dropout": 0.0, + "activation_function": "gelu", + "add_cross_attention": true, + "architectures": [ + "ParlerTTSForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 1025, + "codebook_weights": null, + "cross_attention_implementation_strategy": null, + "dropout": 0.1, + "eos_token_id": 1024, + "ffn_dim": 4096, + "hidden_size": 1024, + "initializer_factor": 0.02, + "is_decoder": true, + "layerdrop": 0.0, + "max_position_embeddings": 4096, + "model_type": "parler_tts_decoder", + "num_attention_heads": 16, + "num_codebooks": 9, + "num_cross_attention_key_value_heads": 16, + "num_hidden_layers": 24, + "num_key_value_heads": 16, + "pad_token_id": 1024, + "rope_embeddings": false, + "rope_theta": 10000.0, + "scale_embedding": false, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.46.1", + "use_cache": true, + "use_fused_lm_heads": false, + "vocab_size": 1088 + } + + You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers + The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results. + + .. code:: ipython3 import IPython.display as ipd @@ -108,10 +263,10 @@ Load the original model and inference - + @@ -159,6 +314,20 @@ and Decoder (``ParlerTTSDecoder``). Lets convert them one by one. 
text_encoder_ov_model = convert(model.text_encoder, TEXT_ENCODER_OV_PATH, example_input) + +.. parsed-literal:: + + WARNING:tensorflow:Please fix your imports. Module tensorflow.python.training.tracking.base has been moved to tensorflow.python.trackable.base. The old module will be deleted in version 2.11. + + +.. parsed-literal:: + + [ WARNING ] Please fix your imports. Module %s has been moved to %s. The old module will be deleted in version %s. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead + warnings.warn( + `loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`. + + The Decoder Model performs in generation pipeline and we can separate it into two stage. In the first stage the model generates ``past_key_values`` into output for the second stage. In the second @@ -193,6 +362,17 @@ stage the model produces tokens during several runs. decoder_1_ov_model = convert(DecoderStage1Wrapper(model.decoder.model.decoder), DECODER_STAGE_1_OV_PATH, example_input) + +.. parsed-literal:: + + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/parler_tts/modeling_parler_tts.py:367: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
+ if seq_len > self.weights.size(0): + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/parler_tts/modeling_parler_tts.py:1713: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + if sequence_length != 1: + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/parler_tts/modeling_parler_tts.py:916: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim): + + .. code:: ipython3 DECODER_STAGE_2_OV_PATH = Path("models/decoder_stage_2_ir.xml") @@ -231,6 +411,15 @@ stage the model produces tokens during several runs. decoder_2_ov_model = convert(DecoderStage2Wrapper(model.decoder.model.decoder), DECODER_STAGE_2_OV_PATH, example_input) + +.. parsed-literal:: + + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/cache_utils.py:458: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results. 
+ or len(self.key_cache[layer_idx]) == 0 # the layer has no cache + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/cache_utils.py:443: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results. + elif len(self.key_cache[layer_idx]) == 0: # fills previously skipped layers; checking for tensor causes errors + + Compiling models and inference ------------------------------ @@ -258,7 +447,7 @@ Select device from dropdown list for running inference using OpenVINO. .. parsed-literal:: - Dropdown(description='Device:', index=4, options=('CPU', 'GPU.0', 'GPU.1', 'GPU.2', 'AUTO'), value='AUTO') + Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO') @@ -360,10 +549,10 @@ and run inference. - + @@ -406,13 +595,27 @@ Interactive inference demo = make_demo(fn=infer) try: - demo.queue().launch(debug=True) + demo.queue().launch(debug=False) except Exception: - demo.queue().launch(share=True, debug=True) + demo.queue().launch(share=True, debug=False) # if you are launching remotely, specify server_name and server_port # demo.launch(server_name='your server name', server_port='server port in int') # Read more in the docs: https://gradio.app/docs/ + +.. parsed-literal:: + + Running on local URL: http://127.0.0.1:7860 + + To create a public link, set `share=True` in `launch()`. + + + + + + + + .. 
code:: ipython3 # please uncomment and run this cell for stopping gradio interface diff --git a/docs/notebooks/person-tracking-with-output.rst b/docs/notebooks/person-tracking-with-output.rst index 653a9b376edf7e..6ac8ff43e05ab2 100644 --- a/docs/notebooks/person-tracking-with-output.rst +++ b/docs/notebooks/person-tracking-with-output.rst @@ -148,7 +148,7 @@ Imports import collections from pathlib import Path import time - + import numpy as np import cv2 from IPython import display @@ -158,17 +158,17 @@ Imports .. code:: ipython3 # Import local modules - + if not Path("./notebook_utils.py").exists(): # Fetch `notebook_utils` module import requests - + r = requests.get( url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", ) - + open("notebook_utils.py", "w").write(r.text) - + import notebook_utils as utils from deepsort_utils.tracker import Tracker from deepsort_utils.nn_matching import NearestNeighborDistanceMetric @@ -200,36 +200,36 @@ by the cosine distance. .. code:: ipython3 from notebook_utils import download_ir_model - + # A directory where the model will be downloaded. base_model_dir = "model" precision = "FP16" # The name of the model from Open Model Zoo detection_model_name = "person-detection-0202" - - + + download_det_model_url = ( f"https://storage.openvinotoolkit.org/repositories/open_model_zoo/2023.0/models_bin/1/{detection_model_name}/{precision}/{detection_model_name}.xml" ) - + detection_model_path = download_ir_model(download_det_model_url, Path(base_model_dir) / detection_model_name / precision) - + reidentification_model_name = "person-reidentification-retail-0287" download_reid_model_url = f"https://storage.openvinotoolkit.org/repositories/open_model_zoo/2023.0/models_bin/1/{reidentification_model_name}/{precision}/{reidentification_model_name}.xml" - + reidentification_model_path = download_ir_model(download_reid_model_url, Path(base_model_dir) / reidentification_model_name / precision) .. 
parsed-literal:: - model/person-detection-0202/FP16/person-detection-0202.bin: 0%| | 0.00/3.47M [00:00 200: processing_times.popleft() - + _, f_width = frame.shape[:2] # Mean processing time [ms]. processing_time = np.mean(processing_times) * 1100 fps = 1000 / processing_time - + # Get poses from detection results. bbox_xywh, score, label = process_results(h, w, results=output) - + img_crops = [] for box in bbox_xywh: x1, y1, x2, y2 = xywh_to_xyxy(box, h, w) img = frame[y1:y2, x1:x2] img_crops.append(img) - + # Get reidentification feature of each person. if img_crops: # preprocess @@ -615,17 +614,17 @@ video file. features = extractor.predict(img_batch) else: features = np.array([]) - + # Wrap the detection and reidentification results together bbox_tlwh = xywh_to_tlwh(bbox_xywh) detections = [Detection(bbox_tlwh[i], features[i]) for i in range(features.shape[0])] - + # predict the position of tracking target tracker.predict() - + # update tracker tracker.update(detections) - + # update bbox identities outputs = [] for track in tracker.tracks: @@ -637,14 +636,14 @@ video file. outputs.append(np.array([x1, y1, x2, y2, track_id], dtype=np.int32)) if len(outputs) > 0: outputs = np.stack(outputs, axis=0) - + # draw box for visualization if len(outputs) > 0: bbox_tlwh = [] bbox_xyxy = outputs[:, :4] identities = outputs[:, -1] frame = draw_boxes(frame, bbox_xyxy, identities) - + cv2.putText( img=frame, text=f"Inference time: {processing_time:.1f}ms ({fps:.1f} FPS)", @@ -655,7 +654,7 @@ video file. thickness=1, lineType=cv2.LINE_AA, ) - + if use_popup: cv2.imshow(winname=title, mat=frame) key = cv2.waitKey(1) @@ -670,7 +669,7 @@ video file. # Display the image in this notebook. display.clear_output(wait=True) display.display(i) - + # ctrl-c except KeyboardInterrupt: print("Interrupted") @@ -724,11 +723,11 @@ will work. .. 
code:: ipython3 USE_WEBCAM = False - + cam_id = 0 video_file = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/video/people.mp4" source = cam_id if USE_WEBCAM else video_file - + run_person_tracking(source=source, flip=USE_WEBCAM, use_popup=False) diff --git a/docs/notebooks/person-tracking-with-output_files/person-tracking-with-output_25_0.png b/docs/notebooks/person-tracking-with-output_files/person-tracking-with-output_25_0.png index f827c9c1094e46..972cc9e5977684 100644 --- a/docs/notebooks/person-tracking-with-output_files/person-tracking-with-output_25_0.png +++ b/docs/notebooks/person-tracking-with-output_files/person-tracking-with-output_25_0.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5dffde5665ae619cc99fddef72befb32d1002becce56dfccf50e7577f1fab020 -size 218904 +oid sha256:1c04ed0e53cb210bd7853d3daa7f77a0a087b8e08099b837d3237b025c223b5d +size 218593 diff --git a/docs/notebooks/phi-3-vision-with-output.rst b/docs/notebooks/phi-3-vision-with-output.rst index 71981daac13be4..dc588206768c93 100644 --- a/docs/notebooks/phi-3-vision-with-output.rst +++ b/docs/notebooks/phi-3-vision-with-output.rst @@ -260,8 +260,8 @@ documentation 1 or self.sliding_window is not None) and self.is_causal: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:164: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
+ /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:164: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if past_key_values_length > 0: /opt/home/k8sworker/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3.5-vision-instruct/4a0d683eba9f1d0cbfb6151705d1ee73c25a80ca/modeling_phi3_v.py:444: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! seq_len = seq_len or torch.max(position_ids) + 1 /opt/home/k8sworker/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3.5-vision-instruct/4a0d683eba9f1d0cbfb6151705d1ee73c25a80ca/modeling_phi3_v.py:445: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if seq_len > self.original_max_position_embeddings: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py:86: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect. 
+ /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/nncf/torch/dynamic_graph/wrappers.py:86: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect. op1 = operator(\*args, \*\*kwargs) - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/cache_utils.py:443: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/cache_utils.py:443: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results. elif len(self.key_cache[layer_idx]) == 0: # fills previously skipped layers; checking for tensor causes errors /opt/home/k8sworker/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3.5-vision-instruct/4a0d683eba9f1d0cbfb6151705d1ee73c25a80ca/modeling_phi3_v.py:683: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): @@ -374,7 +365,7 @@ documentation =4.0.0, but you have protobuf 3.20.3 which is incompatible. + parler-tts 0.2.2 requires protobuf>=4.0.0, but you have protobuf 3.20.3 which is incompatible. tensorflow 2.12.0 requires keras<2.13,>=2.12.0, but you have keras 2.13.1 which is incompatible. tensorflow 2.12.0 requires numpy<1.24,>=1.22, but you have numpy 1.24.4 which is incompatible. tensorflow 2.12.0 requires tensorboard<2.13,>=2.12, but you have tensorboard 2.13.0 which is incompatible. @@ -210,8 +210,8 @@ PhotoMaker to generate the original PhotoMaker pipeline. .. parsed-literal:: - 2024-11-22 02:03:50.933677: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 02:03:50.958255: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 02:49:18.726948: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 02:49:18.751780: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. @@ -230,6 +230,12 @@ PhotoMaker to generate the original PhotoMaker pipeline. 
Loading pipeline components...: 0%| | 0/7 [00:00 1 or self.sliding_window is not None: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:164: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:164: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if past_key_values_length > 0: @@ -587,15 +584,15 @@ original Stable Diffusion XL model. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/unets/unet_2d_condition.py:1103: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/unets/unet_2d_condition.py:1103: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. 
This means that the trace might not generalize to other inputs! if dim % default_overall_up_factor != 0: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/downsampling.py:136: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/downsampling.py:136: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! assert hidden_states.shape[1] == self.channels - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/downsampling.py:145: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/downsampling.py:145: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
assert hidden_states.shape[1] == self.channels - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/upsampling.py:146: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/upsampling.py:146: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! assert hidden_states.shape[1] == self.channels - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/upsampling.py:162: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/upsampling.py:162: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
if hidden_states.shape[0] >= 64: diff --git a/docs/notebooks/pixart-with-output.rst b/docs/notebooks/pixart-with-output.rst index 517191e17501ef..fed1f6b3dada41 100644 --- a/docs/notebooks/pixart-with-output.rst +++ b/docs/notebooks/pixart-with-output.rst @@ -118,8 +118,8 @@ directly in latent space, achieving super fast inference with few steps. .. parsed-literal:: - 2024-11-22 02:11:50.540718: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 02:11:50.565755: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 02:57:23.724286: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 02:57:23.749610: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. @@ -132,6 +132,8 @@ directly in latent space, achieving super fast inference with few steps. .. parsed-literal:: You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 + Some weights of the model checkpoint were not used when initializing PixArtTransformer2DModel: + ['caption_projection.y_embedding'] @@ -140,12 +142,6 @@ directly in latent space, achieving super fast inference with few steps. Loading checkpoint shards: 0%| | 0/4 [00:00= 64: @@ -452,7 +448,7 @@ And insert wrappers instances in the pipeline: .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/configuration_utils.py:140: FutureWarning: Accessing config attribute `_execution_device` directly via 'PixArtAlphaPipeline' object attribute is deprecated. Please access '_execution_device' over 'PixArtAlphaPipeline's config object instead, e.g. 'scheduler.config._execution_device'. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/configuration_utils.py:140: FutureWarning: Accessing config attribute `_execution_device` directly via 'PixArtAlphaPipeline' object attribute is deprecated. Please access '_execution_device' over 'PixArtAlphaPipeline's config object instead, e.g. 'scheduler.config._execution_device'. deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False) @@ -567,7 +563,7 @@ To collect intermediate model inputs for calibration we should customize .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/configuration_utils.py:140: FutureWarning: Accessing config attribute `_execution_device` directly via 'PixArtAlphaPipeline' object attribute is deprecated. 
Please access '_execution_device' over 'PixArtAlphaPipeline's config object instead, e.g. 'scheduler.config._execution_device'. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/configuration_utils.py:140: FutureWarning: Accessing config attribute `_execution_device` directly via 'PixArtAlphaPipeline' object attribute is deprecated. Please access '_execution_device' over 'PixArtAlphaPipeline's config object instead, e.g. 'scheduler.config._execution_device'. deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False) @@ -1625,16 +1621,16 @@ pipelines. Loading pipeline components...: 0%| | 0/5 [00:00 0.19 and openvino version <= 2024.4. Please downgrade to tokenizers version <= 0.19 to export tokenizers to OpenVINO. - INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ 8 │ 6% (1 / 281) │ 0% (0 / 280) │ - ├────────────────┼─────────────────────────────┼────────────────────────────────────────┤ - │ 4 │ 94% (280 / 281) │ 100% (280 / 280) │ - ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ - Applying Weight Compression ━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 0:05:31 • 0:00:00 - INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ 8 │ 6% (3 / 172) │ 0% (0 / 169) │ - ├────────────────┼─────────────────────────────┼────────────────────────────────────────┤ - │ 4 
│ 94% (169 / 172) │ 100% (169 / 169) │ - ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ - Applying Weight Compression ━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 0:00:12 • 0:00:00 - INFO:nncf:Statistics of the bitwidth distribution: - ┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑ - │ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │ - ┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥ - │ 8 │ 100% (1 / 1) │ 0% (0 / 0) │ - ┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙ - Applying Weight Compression ━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 0:00:05 • 0:00:00 - + Traceback (most recent call last): + File "/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/bin/optimum-cli", line 10, in + sys.exit(main()) + File "/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/optimum/commands/optimum_cli.py", line 208, in main + service.run() + File "/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/optimum/commands/export/openvino.py", line 390, in run + main_export( + File "/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/optimum/exporters/openvino/__main__.py", line 476, in main_export + _weight_only_quantization(submodel, quantization_config) + File "/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/optimum/intel/openvino/quantization.py", line 938, in _weight_only_quantization + return nncf.compress_weights( + TypeError: 
compress_weights() got an unexpected keyword argument 'backup_mode' + Run model inference ------------------- @@ -541,8 +528,8 @@ Intel can be found in .. parsed-literal:: - 2024-11-22 03:06:17.214277: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 03:06:17.240005: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 03:48:41.700649: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 03:48:41.726260: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. @@ -587,7 +574,7 @@ Intel can be found in .. parsed-literal:: - The unusual aspect of this image is that the cat is lying inside a cardboard box, which is not a typical setting for a cat. Cats are often known for their affinity for boxes, but it is still considered unusual to see a cat comfortably resting inside a box in a living room setting. The cat appears relaxed and content, which adds to the charm of the scene. The presence of a sofa in the background further emphasizes the domestic and cozy atmosphere of the image. + The unusual aspect of this image is that the cat is lying on its back inside a cardboard box. 
This is not a typical position for a cat, as they usually prefer to curl up or lie on their sides when resting. Additionally, cats are known for their love of small, enclosed spaces, but it is less common to see a cat lying on its back in such a setting. The image captures a playful and relaxed moment, highlighting the cat's comfort and curiosity. Interactive demo diff --git a/docs/notebooks/pose-estimation-with-output.rst b/docs/notebooks/pose-estimation-with-output.rst index e827bd19acfd34..112b6037d4907f 100644 --- a/docs/notebooks/pose-estimation-with-output.rst +++ b/docs/notebooks/pose-estimation-with-output.rst @@ -126,13 +126,13 @@ precision in the code below. .. parsed-literal:: - model/intel/human-pose-estimation-0001/FP16-INT8/human-pose-estimation-0001.xml: 0%| | 0.00/474k [0… + human-pose-estimation-0001.xml: 0%| | 0.00/474k [00:00 target_length: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/cache_utils.py:443: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/cache_utils.py:443: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results. 
elif len(self.key_cache[layer_idx]) == 0: # fills previously skipped layers; checking for tensor causes errors diff --git a/docs/notebooks/qwen2-vl-with-output.rst b/docs/notebooks/qwen2-vl-with-output.rst index d9c51a151e5926..ea0541fe1610a7 100644 --- a/docs/notebooks/qwen2-vl-with-output.rst +++ b/docs/notebooks/qwen2-vl-with-output.rst @@ -55,10 +55,8 @@ In this tutorial we consider how to convert and optimize Qwen2VL model for creating multimodal chatbot. Additionally, we demonstrate how to apply stateful transformation on LLM part and model optimization techniques like weights compression using -`NNCF `__ - - -**Table of contents:** +`NNCF `__ #### Table of +contents: - `Prerequisites <#prerequisites>`__ - `Select model <#select-model>`__ @@ -106,11 +104,11 @@ Prerequisites from pathlib import Path import requests - + if not Path("ov_qwen2_vl.py").exists(): r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/qwen2-vl/ov_qwen2_vl.py") open("ov_qwen2_vl.py", "w").write(r.text) - + if not Path("notebook_utils.py").exists(): r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py") open("notebook_utils.py", "w").write(r.text) @@ -128,9 +126,9 @@ using widget bellow: .. code:: ipython3 from ov_qwen2_vl import model_selector - + model_id = model_selector() - + model_id @@ -141,8 +139,8 @@ using widget bellow: .. parsed-literal:: - 2024-11-22 04:16:41.832996: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 04:16:41.858520: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. 
+ 2024-12-10 05:00:06.245590: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 05:00:06.272261: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. @@ -287,20 +285,20 @@ documentation target_length: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/cache_utils.py:443: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/cache_utils.py:443: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results. elif len(self.key_cache[layer_idx]) == 0: # fills previously skipped layers; checking for tensor causes errors @@ -442,7 +431,7 @@ Intel `__ .. code:: ipython3 from ov_qwen2_vl import OVQwen2VLModel - + # Uncomment below lines to see the model inference class code # OVQwen2VLModel?? @@ -454,9 +443,9 @@ Select inference device .. 
code:: ipython3 from notebook_utils import device_widget - + device = device_widget(default="AUTO", exclude=["NPU"]) - + device @@ -483,25 +472,25 @@ Run model inference from transformers import AutoProcessor, AutoTokenizer from qwen_vl_utils import process_vision_info from transformers import TextStreamer - - + + min_pixels = 256 * 28 * 28 max_pixels = 1280 * 28 * 28 processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels) - + if processor.chat_template is None: tok = AutoTokenizer.from_pretrained(model_dir) processor.chat_template = tok.chat_template - + example_image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg" example_image_path = Path("demo.jpeg") - + if not example_image_path.exists(): Image.open(requests.get(example_image_url, stream=True).raw).save(example_image_path) - + image = Image.open(example_image_path) question = "Describe this image." - + messages = [ { "role": "user", @@ -514,7 +503,7 @@ Run model inference ], } ] - + # Preparation for inference text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) image_inputs, video_inputs = process_vision_info(messages) @@ -525,12 +514,12 @@ Run model inference padding=True, return_tensors="pt", ) - + display(image) print("Question:") print(question) print("Answer:") - + generated_ids = model.generate(**inputs, max_new_tokens=100, streamer=TextStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)) @@ -573,10 +562,10 @@ click ``Submit`` to start communication. .. code:: ipython3 from gradio_helper import make_demo - - + + demo = make_demo(model, processor) - + try: demo.launch(debug=False) except Exception: @@ -589,9 +578,9 @@ click ``Submit`` to start communication. .. parsed-literal:: Running on local URL: http://127.0.0.1:7860 - + Thanks for being a Gradio user! 
If you have questions or feedback, please join our Discord server and chat with us: https://discord.gg/feTf9x3ZSB - + To create a public link, set `share=True` in `launch()`. diff --git a/docs/notebooks/rmbg-background-removal-with-output.rst b/docs/notebooks/rmbg-background-removal-with-output.rst index c2e7286cc35cb4..0961afb2bf1ef5 100644 --- a/docs/notebooks/rmbg-background-removal-with-output.rst +++ b/docs/notebooks/rmbg-background-removal-with-output.rst @@ -112,8 +112,8 @@ it may take some time. .. parsed-literal:: - 2024-11-22 04:19:11.305790: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 04:19:11.330949: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 05:02:42.657474: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 05:02:42.682685: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. @@ -240,7 +240,7 @@ function or directly loading on device using ``core.complie_model``. .. parsed-literal:: [ WARNING ] Please fix your imports. Module %s has been moved to %s. The old module will be deleted in version %s. 
- /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead warnings.warn( `loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`. diff --git a/docs/notebooks/segment-anything-2-image-with-output.rst b/docs/notebooks/segment-anything-2-image-with-output.rst index 1e938df4a9763a..d9b24bf720325b 100644 --- a/docs/notebooks/segment-anything-2-image-with-output.rst +++ b/docs/notebooks/segment-anything-2-image-with-output.rst @@ -120,24 +120,20 @@ Prerequisites .. parsed-literal:: - ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts. - mobileclip 0.1.0 requires clip-benchmark>=1.4.0, which is not installed. - mobileclip 0.1.0 requires torchvision==0.14.1, but you have torchvision 0.19.1+cpu which is incompatible. - parler-tts 0.2.1 requires protobuf>=4.0.0, but you have protobuf 3.20.3 which is incompatible. Note: you may need to restart the kernel to use updated packages. 
Collecting iopath>=0.1.10 Using cached iopath-0.1.10-py3-none-any.whl - Requirement already satisfied: pillow>=9.4.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (10.4.0) - Requirement already satisfied: hydra-core>=1.3.2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (1.3.2) - Requirement already satisfied: tqdm in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from iopath>=0.1.10) (4.67.0) - Requirement already satisfied: typing-extensions in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from iopath>=0.1.10) (4.12.2) - Requirement already satisfied: portalocker in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from iopath>=0.1.10) (3.0.0) - Requirement already satisfied: omegaconf<2.4,>=2.2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.3.2) (2.3.0) - Requirement already satisfied: antlr4-python3-runtime==4.9.* in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.3.2) (4.9.3) - Requirement already satisfied: packaging in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.3.2) (24.2) - Requirement already satisfied: importlib-resources in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.3.2) (6.4.5) - Requirement already satisfied: PyYAML>=5.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from omegaconf<2.4,>=2.2->hydra-core>=1.3.2) (6.0.2) - Requirement already satisfied: zipp>=3.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from importlib-resources->hydra-core>=1.3.2) (3.20.2) + Requirement already satisfied: pillow>=9.4.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (10.4.0) + Requirement already satisfied: hydra-core>=1.3.2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (1.3.2) + Requirement already satisfied: tqdm in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from iopath>=0.1.10) (4.67.1) + Requirement already satisfied: typing-extensions in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from iopath>=0.1.10) (4.12.2) + Requirement already satisfied: portalocker in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from iopath>=0.1.10) (3.0.0) + Requirement already satisfied: omegaconf<2.4,>=2.2 in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.3.2) (2.3.0) + Requirement already satisfied: antlr4-python3-runtime==4.9.* in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.3.2) (4.9.3) + Requirement already satisfied: packaging in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.3.2) (24.2) + Requirement already satisfied: importlib-resources in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.3.2) (6.4.5) + Requirement already satisfied: PyYAML>=5.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from omegaconf<2.4,>=2.2->hydra-core>=1.3.2) (6.0.2) + Requirement already satisfied: zipp>=3.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from importlib-resources->hydra-core>=1.3.2) (3.20.2) Installing collected packages: iopath Attempting uninstall: iopath Found existing installation: iopath 0.1.9 @@ -190,10 +186,10 @@ Clone and install segment-anything-2 .. 
parsed-literal:: env: SAM2_BUILD_CUDA=0 - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/sam2-image-segmentation/sam2 + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/sam2-image-segmentation/sam2 ERROR: Package 'sam-2' requires a different Python: 3.8.10 not in '>=3.10.0' Note: you may need to restart the kernel to use updated packages. - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/sam2-image-segmentation + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/sam2-image-segmentation .. code:: ipython3 @@ -397,7 +393,8 @@ Mask prediction will be includes two models: * **Prompt Encoder** - Encoder for segmentation condition. As a condition can be used points, boxes or segmentation mask. -* **Mask Decoder** - The mask decoder efficiently maps the image embedding, prompt embeddings, and an output +* **Mask Decoder** - The mask decoder + efficiently maps the image embedding, prompt embeddings, and an output token to a mask. Combined prompt encoder and mask decoder model has following list of @@ -488,12 +485,6 @@ Example Image image = cv2.imread("truck.jpg") image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) - -.. parsed-literal:: - - 'truck.jpg' already exists. - - .. 
code:: ipython3 plt.figure(figsize=(10, 10)) diff --git a/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_92_0.png b/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_92_0.png new file mode 100644 index 00000000000000..343e5ecc49fc50 --- /dev/null +++ b/docs/notebooks/segment-anything-2-image-with-output_files/segment-anything-2-image-with-output_92_0.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:520c7390de98864c4ae6b24b940230e83f2b5fc0b1723d58ed9941cc2d9bc70f +size 469439 diff --git a/docs/notebooks/segment-anything-2-video-with-output.rst b/docs/notebooks/segment-anything-2-video-with-output.rst index 20aae9f8a5e3f9..dec5f3d63f341e 100644 --- a/docs/notebooks/segment-anything-2-video-with-output.rst +++ b/docs/notebooks/segment-anything-2-video-with-output.rst @@ -110,18 +110,18 @@ Prerequisites .. parsed-literal:: Note: you may need to restart the kernel to use updated packages. 
- Requirement already satisfied: iopath>=0.1.10 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (0.1.10) - Requirement already satisfied: pillow>=9.4.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (10.4.0) - Requirement already satisfied: hydra-core>=1.3.2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (1.3.2) - Requirement already satisfied: tqdm in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from iopath>=0.1.10) (4.67.0) - Requirement already satisfied: typing-extensions in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from iopath>=0.1.10) (4.12.2) - Requirement already satisfied: portalocker in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from iopath>=0.1.10) (3.0.0) - Requirement already satisfied: omegaconf<2.4,>=2.2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.3.2) (2.3.0) - Requirement already satisfied: antlr4-python3-runtime==4.9.* in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.3.2) (4.9.3) - Requirement already satisfied: packaging in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.3.2) (24.2) - Requirement already satisfied: importlib-resources in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.3.2) (6.4.5) - Requirement already satisfied: PyYAML>=5.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from omegaconf<2.4,>=2.2->hydra-core>=1.3.2) (6.0.2) - Requirement already satisfied: zipp>=3.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from importlib-resources->hydra-core>=1.3.2) (3.20.2) + Requirement already satisfied: iopath>=0.1.10 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (0.1.10) + Requirement already satisfied: pillow>=9.4.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (10.4.0) + Requirement already satisfied: hydra-core>=1.3.2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (1.3.2) + Requirement already satisfied: tqdm in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from iopath>=0.1.10) (4.67.1) + Requirement already satisfied: typing-extensions in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from 
iopath>=0.1.10) (4.12.2) + Requirement already satisfied: portalocker in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from iopath>=0.1.10) (3.0.0) + Requirement already satisfied: omegaconf<2.4,>=2.2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.3.2) (2.3.0) + Requirement already satisfied: antlr4-python3-runtime==4.9.* in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.3.2) (4.9.3) + Requirement already satisfied: packaging in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.3.2) (24.2) + Requirement already satisfied: importlib-resources in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from hydra-core>=1.3.2) (6.4.5) + Requirement already satisfied: PyYAML>=5.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from omegaconf<2.4,>=2.2->hydra-core>=1.3.2) (6.0.2) + Requirement already satisfied: zipp>=3.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from importlib-resources->hydra-core>=1.3.2) (3.20.2) Note: you may need to restart the kernel to use updated packages. Note: you may need to restart the kernel to use updated packages. @@ -174,7 +174,7 @@ Clone and install segment-anything-2 .. 
parsed-literal:: env: SAM2_BUILD_CUDA=0 - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/sam2-video-segmentation/sam2 + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/sam2-video-segmentation/sam2 .. parsed-literal:: @@ -203,7 +203,7 @@ Clone and install segment-anything-2 ERROR: Package 'sam-2' requires a different Python: 3.8.10 not in '>=3.10.0' Note: you may need to restart the kernel to use updated packages. - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/sam2-video-segmentation + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/sam2-video-segmentation .. code:: ipython3 @@ -601,14 +601,14 @@ Prepare data .. parsed-literal:: - data/coco.mp4: 0%| | 0.00/877k [00:00 - + Your browser does not support the video tag. @@ -840,7 +840,7 @@ Example with box .. parsed-literal:: - frame loading (JPEG): 100%|██████████| 50/50 [00:00<00:00, 52.72it/s] + frame loading (JPEG): 100%|██████████| 25/25 [00:00<00:00, 54.66it/s] @@ -877,7 +877,7 @@ Example with box .. parsed-literal:: - propagate in video: 100%|██████████| 50/50 [07:47<00:00, 9.35s/it] + propagate in video: 100%|██████████| 25/25 [03:37<00:00, 8.71s/it] .. code:: ipython3 @@ -894,7 +894,7 @@ Example with box .. raw:: html @@ -927,7 +927,7 @@ Run Interactive For Video Segmentation with Gradio .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/gradio/analytics.py:106: UserWarning: IMPORTANT: You are using gradio version 4.40.0, however version 4.44.1 is available, please upgrade. 
+ /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/gradio/analytics.py:106: UserWarning: IMPORTANT: You are using gradio version 4.40.0, however version 4.44.1 is available, please upgrade. -------- warnings.warn( diff --git a/docs/notebooks/segment-anything-2-video-with-output_files/segment-anything-2-video-with-output_40_1.png b/docs/notebooks/segment-anything-2-video-with-output_files/segment-anything-2-video-with-output_40_1.png index 5721f78113b9a5..8b2efbd6f030df 100644 --- a/docs/notebooks/segment-anything-2-video-with-output_files/segment-anything-2-video-with-output_40_1.png +++ b/docs/notebooks/segment-anything-2-video-with-output_files/segment-anything-2-video-with-output_40_1.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dce79554325cf25434872511d2c96b1361ab4a3e14f23a936d227177ee98836f -size 193590 +oid sha256:bdf6f36d230ce5b74e070f0abb2e3672a1ae3f31094c2444a0e0623b95f1bf35 +size 193591 diff --git a/docs/notebooks/segment-anything-2-video-with-output_files/segment-anything-2-video-with-output_46_0.png b/docs/notebooks/segment-anything-2-video-with-output_files/segment-anything-2-video-with-output_46_0.png index e18f213004313f..65df892bd6e8c2 100644 --- a/docs/notebooks/segment-anything-2-video-with-output_files/segment-anything-2-video-with-output_46_0.png +++ b/docs/notebooks/segment-anything-2-video-with-output_files/segment-anything-2-video-with-output_46_0.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:00bfe0191dd680f768ea740c00fc4e6d9054e72c250971fc8e12807159a26644 -size 190195 +oid sha256:6b3a974acb951d94d941f150b640a1dcce172f6974085774adbf06e22adeb386 +size 190202 diff --git a/docs/notebooks/siglip-zero-shot-image-classification-with-output.rst b/docs/notebooks/siglip-zero-shot-image-classification-with-output.rst index a38b7c56a2ec8a..a1738642568a2b 100644 --- 
a/docs/notebooks/siglip-zero-shot-image-classification-with-output.rst +++ b/docs/notebooks/siglip-zero-shot-image-classification-with-output.rst @@ -120,8 +120,8 @@ tokenizer and preparing the images. .. parsed-literal:: - 2024-11-22 04:41:05.723109: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 04:41:05.748466: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 05:15:56.596890: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 05:15:56.621776: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. @@ -258,7 +258,7 @@ object ready to load on the device and start making predictions. .. parsed-literal:: [ WARNING ] Please fix your imports. Module %s has been moved to %s. The old module will be deleted in version %s. - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. 
Please use `model.hf_quantizer.is_trainable` instead + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead warnings.warn( `loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`. @@ -604,7 +604,7 @@ model are similar to the PyTorch model. .. parsed-literal:: - [{'dog': 0.99}, {'horse': 0.0}, {'cat': 0.0}, {'wolf': 0.0}, {'tiger': 0.0}] + [{'dog': 0.99}, {'horse': 0.0}, {'cat': 0.0}, {'wolf': 0.0}, {'frog': 0.0}] @@ -679,7 +679,7 @@ approximately estimate the speed up of the dynamic quantized models. .. parsed-literal:: - Performance speed up: 2.016 + Performance speed up: 1.907 Interactive inference diff --git a/docs/notebooks/siglip-zero-shot-image-classification-with-output_files/siglip-zero-shot-image-classification-with-output_24_1.png b/docs/notebooks/siglip-zero-shot-image-classification-with-output_files/siglip-zero-shot-image-classification-with-output_24_1.png index 611278a49d1583..6e5afc5acf92a6 100644 --- a/docs/notebooks/siglip-zero-shot-image-classification-with-output_files/siglip-zero-shot-image-classification-with-output_24_1.png +++ b/docs/notebooks/siglip-zero-shot-image-classification-with-output_files/siglip-zero-shot-image-classification-with-output_24_1.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f15546e58fac808ed62a6fcc29f2b58b48a974070a9d8c0b5c199c533b747d0 -size 580998 +oid sha256:3ebc30e695ed16710b909a552137d214ca9defb109984e4da59e8b684ce59427 +size 581000 diff --git a/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output.rst b/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output.rst index e9fcfb3f8baa9f..a5b31e15d97ec2 100644 --- 
a/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output.rst +++ b/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output.rst @@ -61,8 +61,8 @@ and install required packages. ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts. modelscope-studio 0.5.2 requires gradio<6.0,>=4.0, but you have gradio 3.43.1 which is incompatible. - parler-tts 0.2.1 requires protobuf>=4.0.0, but you have protobuf 3.20.3 which is incompatible. - parler-tts 0.2.1 requires transformers<=4.46.1,>=4.46.1, but you have transformers 4.46.3 which is incompatible. + parler-tts 0.2.2 requires protobuf>=4.0.0, but you have protobuf 3.20.3 which is incompatible. + parler-tts 0.2.2 requires transformers<=4.46.1,>=4.46.1, but you have transformers 4.46.3 which is incompatible. Note: you may need to restart the kernel to use updated packages. @@ -121,7 +121,7 @@ and install required packages. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/sketch-to-image-pix2pix-turbo/img2img-turbo + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/sketch-to-image-pix2pix-turbo/img2img-turbo Load PyTorch model @@ -381,10 +381,10 @@ diagram indicate trainable layers. Semi-transparent layers are frozen. .. parsed-literal:: - 2024-11-22 04:46:27.445712: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 04:46:27.471919: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. 
+ 2024-12-10 05:21:48.209793: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 05:21:48.234621: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/utils/outputs.py:63: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/utils/outputs.py:63: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. torch.utils._pytree._register_pytree_node( @@ -402,7 +402,7 @@ diagram indicate trainable layers. Semi-transparent layers are frozen. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/huggingface_hub/file_download.py:1142: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. 
+ /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/huggingface_hub/file_download.py:1142: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. warnings.warn( @@ -413,8 +413,8 @@ diagram indicate trainable layers. Semi-transparent layers are frozen. .. parsed-literal:: - 100%|██████████| 525M/525M [18:17<00:00, 478kiB/s] - /tmp/ipykernel_3576883/2531017353.py:172: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + 100%|██████████| 525M/525M [07:34<00:00, 1.15MiB/s] + /tmp/ipykernel_2241734/2531017353.py:172: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). 
In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. sd = torch.load(p_ckpt, map_location="cpu") @@ -473,30 +473,30 @@ on disk using ``ov.save_model`` in compressed to FP16 format. .. parsed-literal:: [ WARNING ] Please fix your imports. Module %s has been moved to %s. The old module will be deleted in version %s. - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead warnings.warn( `loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`. - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:88: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. 
This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:88: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if input_shape[-1] > 1 or self.sliding_window is not None: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:164: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:164: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if past_key_values_length > 0: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/downsampling.py:135: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. 
This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/downsampling.py:135: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! assert hidden_states.shape[1] == self.channels - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/downsampling.py:144: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/downsampling.py:144: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! assert hidden_states.shape[1] == self.channels - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/unet_2d_condition.py:915: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
+ /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/unet_2d_condition.py:915: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if dim % default_overall_up_factor != 0: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/upsampling.py:149: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/upsampling.py:149: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! assert hidden_states.shape[1] == self.channels - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/upsampling.py:165: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
+ /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/upsampling.py:165: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if hidden_states.shape[0] >= 64: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/schedulers/scheduling_ddpm.py:433: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/schedulers/scheduling_ddpm.py:433: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type in ["learned", "learned_range"]: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/schedulers/scheduling_ddpm.py:440: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
+ /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/schedulers/scheduling_ddpm.py:440: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/schedulers/scheduling_ddpm.py:479: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/schedulers/scheduling_ddpm.py:479: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if t > 0: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/schedulers/scheduling_ddpm.py:330: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
+ /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/schedulers/scheduling_ddpm.py:330: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one @@ -676,17 +676,17 @@ Download results using download button .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/sketch-to-image-pix2pix-turbo/gradio_helper.py:225: GradioDeprecationWarning: 'scale' value should be an integer. Using 0.4 will cause issues. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/sketch-to-image-pix2pix-turbo/gradio_helper.py:225: GradioDeprecationWarning: 'scale' value should be an integer. Using 0.4 will cause issues. with gr.Column(elem_id="column_process", min_width=50, scale=0.4): - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/gradio/utils.py:776: UserWarning: Expected 1 arguments for function . at 0x7f22fbf5a550>, received 0. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/gradio/utils.py:776: UserWarning: Expected 1 arguments for function . at 0x7fafe0603c10>, received 0. warnings.warn( - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/gradio/utils.py:780: UserWarning: Expected at least 1 arguments for function . 
at 0x7f22fbf5a550>, received 0. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/gradio/utils.py:780: UserWarning: Expected at least 1 arguments for function . at 0x7fafe0603c10>, received 0. warnings.warn( .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/sketch-to-image-pix2pix-turbo + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/sketch-to-image-pix2pix-turbo Running on local URL: http://127.0.0.1:7860 To create a public link, set `share=True` in `launch()`. diff --git a/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output_files/sketch-to-image-pix2pix-turbo-with-output_19_0.jpg b/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output_files/sketch-to-image-pix2pix-turbo-with-output_19_0.jpg index 9d8436d4e8894f..a054eb11c32455 100644 --- a/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output_files/sketch-to-image-pix2pix-turbo-with-output_19_0.jpg +++ b/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output_files/sketch-to-image-pix2pix-turbo-with-output_19_0.jpg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6c28978a7a34769c09d64918686fb69f0239eb9f6499e590a86af16ca1a416d4 -size 23636 +oid sha256:7f92cbd6bb14242b47d354389a04e3413c94c46d233e71b73e305bfb73085a10 +size 23649 diff --git a/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output_files/sketch-to-image-pix2pix-turbo-with-output_19_0.png b/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output_files/sketch-to-image-pix2pix-turbo-with-output_19_0.png index cacdc0c183ea23..336a9ae38fa096 100644 --- a/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output_files/sketch-to-image-pix2pix-turbo-with-output_19_0.png +++ 
b/docs/notebooks/sketch-to-image-pix2pix-turbo-with-output_files/sketch-to-image-pix2pix-turbo-with-output_19_0.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e5acfcf35473541de444c0a2edbfec36423f37335fecf6844a179c65530a6b54 -size 303319 +oid sha256:179009716266de8c220bfe9b7b3d64410061f8ae8bf74a08305655c020cde76f +size 303164 diff --git a/docs/notebooks/sparsity-optimization-with-output.rst b/docs/notebooks/sparsity-optimization-with-output.rst index 8d3779621fb2ec..038a8db6aec1b1 100644 --- a/docs/notebooks/sparsity-optimization-with-output.rst +++ b/docs/notebooks/sparsity-optimization-with-output.rst @@ -82,8 +82,8 @@ Imports .. parsed-literal:: - 2024-11-22 05:06:26.947305: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 05:06:26.972806: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 05:31:08.167081: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 05:31:08.192294: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. @@ -202,7 +202,7 @@ as an example. It is recommended to tune based on your applications. [ WARNING ] Performance hint was not explicitly specified in command line. 
Device(CPU) performance hint will be set to PerformanceMode.THROUGHPUT. [Step 4/11] Reading model files [ INFO ] Loading model files - [ INFO ] Read model took 68.94 ms + [ INFO ] Read model took 72.79 ms [ INFO ] Original model I/O parameters: [ INFO ] Model inputs: [ INFO ] input_ids (node: input_ids) : i64 / [...] / [?,?] @@ -213,7 +213,7 @@ as an example. It is recommended to tune based on your applications. [Step 5/11] Resizing model to match image sizes and given batch [ INFO ] Model batch size: 1 [ INFO ] Reshaping model: 'input_ids': [1,64], 'attention_mask': [1,64], 'token_type_ids': [1,64] - [ INFO ] Reshape model took 28.06 ms + [ INFO ] Reshape model took 27.96 ms [Step 6/11] Configuring input of the model [ INFO ] Model inputs: [ INFO ] input_ids (node: input_ids) : i64 / [...] / [1,64] @@ -222,7 +222,7 @@ as an example. It is recommended to tune based on your applications. [ INFO ] Model outputs: [ INFO ] logits (node: logits) : f32 / [...] / [1,2] [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 999.63 ms + [ INFO ] Compile model took 1082.12 ms [Step 8/11] Querying optimal runtime parameters [ INFO ] Model: [ INFO ] NETWORK_NAME: torch_jit @@ -254,17 +254,17 @@ as an example. It is recommended to tune based on your applications. [ INFO ] Fill input 'token_type_ids' with random values [Step 10/11] Measuring performance (Start inference asynchronously, 4 inference requests, limits: 60000 ms duration) [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). 
- [ INFO ] First inference took 27.20 ms + [ INFO ] First inference took 28.08 ms [Step 11/11] Dumping statistics report [ INFO ] Execution Devices:['CPU'] [ INFO ] Count: 9176 iterations - [ INFO ] Duration: 60047.45 ms + [ INFO ] Duration: 60033.51 ms [ INFO ] Latency: [ INFO ] Median: 25.83 ms - [ INFO ] Average: 25.91 ms - [ INFO ] Min: 24.30 ms - [ INFO ] Max: 37.67 ms - [ INFO ] Throughput: 152.81 FPS + [ INFO ] Average: 25.92 ms + [ INFO ] Min: 23.43 ms + [ INFO ] Max: 42.58 ms + [ INFO ] Throughput: 152.85 FPS Benchmark quantized sparse inference performance @@ -321,7 +321,7 @@ for which a layer will be enabled. [ WARNING ] Performance hint was not explicitly specified in command line. Device(CPU) performance hint will be set to PerformanceMode.THROUGHPUT. [Step 4/11] Reading model files [ INFO ] Loading model files - [ INFO ] Read model took 71.97 ms + [ INFO ] Read model took 75.90 ms [ INFO ] Original model I/O parameters: [ INFO ] Model inputs: [ INFO ] input_ids (node: input_ids) : i64 / [...] / [?,?] @@ -332,7 +332,7 @@ for which a layer will be enabled. [Step 5/11] Resizing model to match image sizes and given batch [ INFO ] Model batch size: 1 [ INFO ] Reshaping model: 'input_ids': [1,64], 'attention_mask': [1,64], 'token_type_ids': [1,64] - [ INFO ] Reshape model took 28.33 ms + [ INFO ] Reshape model took 28.30 ms [Step 6/11] Configuring input of the model [ INFO ] Model inputs: [ INFO ] input_ids (node: input_ids) : i64 / [...] / [1,64] @@ -341,7 +341,7 @@ for which a layer will be enabled. [ INFO ] Model outputs: [ INFO ] logits (node: logits) : f32 / [...] / [1,2] [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 1001.30 ms + [ INFO ] Compile model took 1011.04 ms [Step 8/11] Querying optimal runtime parameters [ INFO ] Model: [ INFO ] NETWORK_NAME: torch_jit @@ -373,17 +373,17 @@ for which a layer will be enabled. 
[ INFO ] Fill input 'token_type_ids' with random values [Step 10/11] Measuring performance (Start inference asynchronously, 4 inference requests, limits: 60000 ms duration) [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 28.02 ms + [ INFO ] First inference took 27.34 ms [Step 11/11] Dumping statistics report [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 9216 iterations - [ INFO ] Duration: 60030.33 ms + [ INFO ] Count: 9152 iterations + [ INFO ] Duration: 60027.24 ms [ INFO ] Latency: - [ INFO ] Median: 25.92 ms - [ INFO ] Average: 25.94 ms - [ INFO ] Min: 23.04 ms - [ INFO ] Max: 31.17 ms - [ INFO ] Throughput: 153.52 FPS + [ INFO ] Median: 25.91 ms + [ INFO ] Average: 25.97 ms + [ INFO ] Min: 23.89 ms + [ INFO ] Max: 41.37 ms + [ INFO ] Throughput: 152.46 FPS When this might be helpful diff --git a/docs/notebooks/speculative-sampling-with-output.rst b/docs/notebooks/speculative-sampling-with-output.rst index 4d5656cb99645c..868fbe9beccf9e 100644 --- a/docs/notebooks/speculative-sampling-with-output.rst +++ b/docs/notebooks/speculative-sampling-with-output.rst @@ -214,7 +214,23 @@ generation is finished, we will write streamer function. pipe = ov_genai.LLMPipeline(target_model_path, device.value) config = ov_genai.GenerationConfig() - config.max_new_tokens = 100 + config.max_new_tokens = 330 + prompt = ''' + + def prime_fib(n: int): + """ + prime_fib returns n-th number that is a Fibonacci number and it's also prime. + >>> prime_fib(1) + 2 + >>> prime_fib(2) + 3 + >>> prime_fib(3) + 5 + >>> prime_fib(4) + 13 + >>> prime_fib(5) + 89 + """''' def streamer(subword): @@ -225,7 +241,7 @@ generation is finished, we will write streamer function. 
start_time = time.perf_counter() - pipe.generate(["Sun is yellow because"], config, streamer=streamer) + pipe.generate(prompt, config, streamer=streamer) end_time = time.perf_counter() @@ -239,7 +255,7 @@ generation is finished, we will write streamer function. print(f"Generation time: {end_time - start_time:.2f}s") del pipe - gc.collect(); + gc.collect() .. parsed-literal:: @@ -282,17 +298,19 @@ stops the current token generation iteration is not yet reached. scheduler_config = ov_genai.SchedulerConfig() # cache params - scheduler_config.cache_size = 2 + scheduler_config.cache_size = 0 + scheduler_config.num_kv_blocks = 2048 // 8 + scheduler_config.max_num_batched_tokens = 2048 draft_model = ov_genai.draft_model(draft_model_path, device.value) pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model, scheduler_config=scheduler_config) config = ov_genai.GenerationConfig() - config.max_new_tokens = 100 - config.num_assistant_tokens = 3 + config.max_new_tokens = 330 + config.num_assistant_tokens = 5 start_time = time.perf_counter() - result = pipe.generate(["Sun is yellow because"], config, streamer=streamer) + result = pipe.generate(prompt, config, streamer=streamer) end_time = time.perf_counter() diff --git a/docs/notebooks/speech-recognition-quantization-wav2vec2-with-output.rst b/docs/notebooks/speech-recognition-quantization-wav2vec2-with-output.rst index 0b9b8db99880b6..27fad907b62fd6 100644 --- a/docs/notebooks/speech-recognition-quantization-wav2vec2-with-output.rst +++ b/docs/notebooks/speech-recognition-quantization-wav2vec2-with-output.rst @@ -57,47 +57,47 @@ Guide =0.11.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (1.5.2) - Requirement already satisfied: torch>=2.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages 
(2.4.1+cpu) - Requirement already satisfied: filelock in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (3.16.1) - Requirement already satisfied: numpy>=1.17 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (1.24.4) - Requirement already satisfied: pyarrow>=15.0.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (17.0.0) - Requirement already satisfied: dill<0.3.9,>=0.3.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (0.3.8) - Requirement already satisfied: pandas in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (2.0.3) - Requirement already satisfied: requests>=2.32.2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (2.32.3) - Requirement already satisfied: tqdm>=4.66.3 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (4.67.0) - Requirement already satisfied: xxhash in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (3.5.0) - Requirement already satisfied: multiprocess in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (0.70.16) - Requirement already satisfied: fsspec<=2024.6.1,>=2023.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets) (2024.6.1) - Requirement already satisfied: aiohttp in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (3.10.11) - Requirement already satisfied: huggingface-hub>=0.22.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (0.25.2) - Requirement already satisfied: packaging in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (24.2) - Requirement already satisfied: pyyaml>=5.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (6.0.2) - Requirement already satisfied: lightning-utilities>=0.8.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torchmetrics>=0.11.0) (0.11.9) - Requirement already satisfied: typing-extensions in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torchmetrics>=0.11.0) (4.12.2) - Requirement already satisfied: sympy in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch>=2.1.0) (1.13.3) - Requirement already satisfied: networkx in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch>=2.1.0) (3.1) - Requirement already satisfied: jinja2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch>=2.1.0) (3.1.4) - Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from aiohttp->datasets) (2.4.3) - Requirement already satisfied: aiosignal>=1.1.2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from aiohttp->datasets) (1.3.1) - Requirement already satisfied: attrs>=17.3.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from aiohttp->datasets) (24.2.0) - Requirement already satisfied: frozenlist>=1.1.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from aiohttp->datasets) (1.5.0) - Requirement already satisfied: multidict<7.0,>=4.5 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from aiohttp->datasets) (6.1.0) - Requirement already satisfied: yarl<2.0,>=1.12.0 in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from aiohttp->datasets) (1.15.2) - Requirement already satisfied: async-timeout<6.0,>=4.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from aiohttp->datasets) (5.0.1) - Requirement already satisfied: setuptools in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from lightning-utilities>=0.8.0->torchmetrics>=0.11.0) (75.3.0) - Requirement already satisfied: charset-normalizer<4,>=2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests>=2.32.2->datasets) (3.4.0) - Requirement already satisfied: idna<4,>=2.5 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests>=2.32.2->datasets) (3.10) - Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests>=2.32.2->datasets) (2.2.3) - Requirement already satisfied: certifi>=2017.4.17 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests>=2.32.2->datasets) (2024.8.30) - Requirement already satisfied: MarkupSafe>=2.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from jinja2->torch>=2.1.0) (2.1.5) - Requirement already satisfied: python-dateutil>=2.8.2 in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from pandas->datasets) (2.9.0.post0) - Requirement already satisfied: pytz>=2020.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from pandas->datasets) (2024.2) - Requirement already satisfied: tzdata>=2022.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from pandas->datasets) (2024.2) - Requirement already satisfied: mpmath<1.4,>=1.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from sympy->torch>=2.1.0) (1.3.0) - Requirement already satisfied: six>=1.5 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0) - Requirement already satisfied: propcache>=0.2.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from yarl<2.0,>=1.12.0->aiohttp->datasets) (0.2.0) + Requirement already satisfied: datasets in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (3.0.0) + Requirement already satisfied: torchmetrics>=0.11.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (1.5.2) + Requirement already satisfied: torch>=2.1.0 in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (2.4.1+cpu) + Requirement already satisfied: filelock in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (3.16.1) + Requirement already satisfied: numpy>=1.17 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (1.24.4) + Requirement already satisfied: pyarrow>=15.0.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (17.0.0) + Requirement already satisfied: dill<0.3.9,>=0.3.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (0.3.8) + Requirement already satisfied: pandas in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (2.0.3) + Requirement already satisfied: requests>=2.32.2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (2.32.3) + Requirement already satisfied: tqdm>=4.66.3 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (4.67.1) + Requirement already satisfied: xxhash in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (3.5.0) + Requirement already satisfied: 
multiprocess in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (0.70.16) + Requirement already satisfied: fsspec<=2024.6.1,>=2023.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets) (2024.6.1) + Requirement already satisfied: aiohttp in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (3.10.11) + Requirement already satisfied: huggingface-hub>=0.22.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (0.25.2) + Requirement already satisfied: packaging in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (24.2) + Requirement already satisfied: pyyaml>=5.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from datasets) (6.0.2) + Requirement already satisfied: lightning-utilities>=0.8.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torchmetrics>=0.11.0) (0.11.9) + Requirement already satisfied: typing-extensions in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torchmetrics>=0.11.0) (4.12.2) + Requirement already satisfied: sympy in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch>=2.1.0) (1.13.3) + Requirement already satisfied: networkx in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch>=2.1.0) (3.1) + Requirement already satisfied: jinja2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from torch>=2.1.0) (3.1.4) + Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from aiohttp->datasets) (2.4.4) + Requirement already satisfied: aiosignal>=1.1.2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from aiohttp->datasets) (1.3.1) + Requirement already satisfied: attrs>=17.3.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from aiohttp->datasets) (24.2.0) + Requirement already satisfied: frozenlist>=1.1.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from aiohttp->datasets) (1.5.0) + Requirement already satisfied: multidict<7.0,>=4.5 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from aiohttp->datasets) (6.1.0) + Requirement already satisfied: yarl<2.0,>=1.12.0 in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from aiohttp->datasets) (1.15.2) + Requirement already satisfied: async-timeout<6.0,>=4.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from aiohttp->datasets) (5.0.1) + Requirement already satisfied: setuptools in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from lightning-utilities>=0.8.0->torchmetrics>=0.11.0) (75.3.0) + Requirement already satisfied: charset-normalizer<4,>=2 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests>=2.32.2->datasets) (3.4.0) + Requirement already satisfied: idna<4,>=2.5 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests>=2.32.2->datasets) (3.10) + Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests>=2.32.2->datasets) (2.2.3) + Requirement already satisfied: certifi>=2017.4.17 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from requests>=2.32.2->datasets) (2024.8.30) + Requirement already satisfied: MarkupSafe>=2.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from jinja2->torch>=2.1.0) (2.1.5) + Requirement already satisfied: python-dateutil>=2.8.2 in 
/opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from pandas->datasets) (2.9.0.post0) + Requirement already satisfied: pytz>=2020.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from pandas->datasets) (2024.2) + Requirement already satisfied: tzdata>=2022.1 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from pandas->datasets) (2024.2) + Requirement already satisfied: mpmath<1.4,>=1.1.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from sympy->torch>=2.1.0) (1.3.0) + Requirement already satisfied: six>=1.5 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.17.0) + Requirement already satisfied: propcache>=0.2.0 in /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages (from yarl<2.0,>=1.12.0->aiohttp->datasets) (0.2.0) Note: you may need to restart the kernel to use updated packages. Note: you may need to restart the kernel to use updated packages. @@ -119,8 +119,8 @@ Imports .. parsed-literal:: - 2024-11-22 05:08:52.722966: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 
- 2024-11-22 05:08:52.748262: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 05:33:33.150578: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 05:33:33.175323: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. @@ -177,10 +177,10 @@ IR). .. parsed-literal:: [ WARNING ] Please fix your imports. Module %s has been moved to %s. The old module will be deleted in version %s. - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead warnings.warn( `loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`. 
- /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py:872: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py:872: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim): @@ -507,7 +507,7 @@ quantized model. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torchmetrics/utilities/prints.py:62: FutureWarning: Importing `WordErrorRate` from `torchmetrics` was deprecated and will be removed in 2.0. Import `WordErrorRate` from `torchmetrics.text` instead. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torchmetrics/utilities/prints.py:62: FutureWarning: Importing `WordErrorRate` from `torchmetrics` was deprecated and will be removed in 2.0. Import `WordErrorRate` from `torchmetrics.text` instead. _future_warning( @@ -577,7 +577,7 @@ models. [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. 
[Step 4/11] Reading model files [ INFO ] Loading model files - [ INFO ] Read model took 18.23 ms + [ INFO ] Read model took 17.68 ms [ INFO ] Original model I/O parameters: [ INFO ] Model inputs: [ INFO ] input_values , 45 (node: input_values) : f32 / [...] / [?,?] @@ -586,14 +586,14 @@ models. [Step 5/11] Resizing model to match image sizes and given batch [ INFO ] Model batch size: 1 [ INFO ] Reshaping model: '45': [1,30480] - [ INFO ] Reshape model took 4.39 ms + [ INFO ] Reshape model took 4.15 ms [Step 6/11] Configuring input of the model [ INFO ] Model inputs: [ INFO ] input_values , 45 (node: input_values) : f32 / [...] / [1,30480] [ INFO ] Model outputs: [ INFO ] logits (node: __module.lm_head/aten::linear/Add) : f32 / [...] / [1,95,32] [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 493.97 ms + [ INFO ] Compile model took 492.85 ms [Step 8/11] Querying optimal runtime parameters [ INFO ] Model: [ INFO ] NETWORK_NAME: Model0 @@ -630,17 +630,17 @@ models. [ INFO ] Fill input '45' with random values [Step 10/11] Measuring performance (Start inference asynchronously, 6 inference requests, limits: 120000 ms duration) [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 69.21 ms + [ INFO ] First inference took 70.68 ms [Step 11/11] Dumping statistics report [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 5430 iterations - [ INFO ] Duration: 120128.75 ms + [ INFO ] Count: 5424 iterations + [ INFO ] Duration: 120129.30 ms [ INFO ] Latency: - [ INFO ] Median: 130.74 ms - [ INFO ] Average: 132.58 ms - [ INFO ] Min: 66.32 ms - [ INFO ] Max: 307.29 ms - [ INFO ] Throughput: 45.20 FPS + [ INFO ] Median: 130.69 ms + [ INFO ] Average: 132.71 ms + [ INFO ] Min: 66.95 ms + [ INFO ] Max: 336.57 ms + [ INFO ] Throughput: 45.15 FPS .. code:: ipython3 @@ -667,7 +667,7 @@ models. [ WARNING ] Performance hint was not explicitly specified in command line. 
Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. [Step 4/11] Reading model files [ INFO ] Loading model files - [ INFO ] Read model took 25.21 ms + [ INFO ] Read model took 24.12 ms [ INFO ] Original model I/O parameters: [ INFO ] Model inputs: [ INFO ] input_values , 45 (node: input_values) : f32 / [...] / [?,?] @@ -676,14 +676,14 @@ models. [Step 5/11] Resizing model to match image sizes and given batch [ INFO ] Model batch size: 1 [ INFO ] Reshaping model: '45': [1,30480] - [ INFO ] Reshape model took 6.04 ms + [ INFO ] Reshape model took 6.07 ms [Step 6/11] Configuring input of the model [ INFO ] Model inputs: [ INFO ] input_values , 45 (node: input_values) : f32 / [...] / [1,30480] [ INFO ] Model outputs: [ INFO ] logits (node: __module.lm_head/aten::linear/Add) : f32 / [...] / [1,95,32] [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 1188.53 ms + [ INFO ] Compile model took 1216.49 ms [Step 8/11] Querying optimal runtime parameters [ INFO ] Model: [ INFO ] NETWORK_NAME: Model0 @@ -720,15 +720,15 @@ models. [ INFO ] Fill input '45' with random values [Step 10/11] Measuring performance (Start inference asynchronously, 6 inference requests, limits: 120000 ms duration) [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). 
- [ INFO ] First inference took 55.48 ms + [ INFO ] First inference took 54.72 ms [Step 11/11] Dumping statistics report [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 8046 iterations - [ INFO ] Duration: 120134.14 ms + [ INFO ] Count: 8016 iterations + [ INFO ] Duration: 120075.42 ms [ INFO ] Latency: - [ INFO ] Median: 88.11 ms - [ INFO ] Average: 89.43 ms - [ INFO ] Min: 71.74 ms - [ INFO ] Max: 270.18 ms - [ INFO ] Throughput: 66.98 FPS + [ INFO ] Median: 88.25 ms + [ INFO ] Average: 89.73 ms + [ INFO ] Min: 39.47 ms + [ INFO ] Max: 249.83 ms + [ INFO ] Throughput: 66.76 FPS diff --git a/docs/notebooks/speechbrain-emotion-recognition-with-output.rst b/docs/notebooks/speechbrain-emotion-recognition-with-output.rst index 23857ad92d4fa2..0f2b2a55f67169 100644 --- a/docs/notebooks/speechbrain-emotion-recognition-with-output.rst +++ b/docs/notebooks/speechbrain-emotion-recognition-with-output.rst @@ -63,9 +63,9 @@ Installations detectron2 0.6 requires iopath<0.1.10,>=0.1.7, but you have iopath 0.1.10 which is incompatible. mobileclip 0.1.0 requires torchvision==0.14.1, but you have torchvision 0.19.1+cpu which is incompatible. modelscope-studio 0.5.2 requires gradio<6.0,>=4.0, but you have gradio 3.43.1 which is incompatible. - parler-tts 0.2.1 requires protobuf>=4.0.0, but you have protobuf 3.20.3 which is incompatible. - parler-tts 0.2.1 requires transformers<=4.46.1,>=4.46.1, but you have transformers 4.46.3 which is incompatible. - pydantic 2.10.0 requires typing-extensions>=4.12.2, but you have typing-extensions 4.9.0 which is incompatible. + parler-tts 0.2.2 requires protobuf>=4.0.0, but you have protobuf 3.20.3 which is incompatible. + parler-tts 0.2.2 requires transformers<=4.46.1,>=4.46.1, but you have transformers 4.46.3 which is incompatible. + pydantic 2.10.3 requires typing-extensions>=4.12.2, but you have typing-extensions 4.9.0 which is incompatible. tensorflow 2.12.0 requires keras<2.13,>=2.12.0, but you have keras 2.13.1 which is incompatible. 
tensorflow 2.12.0 requires numpy<1.24,>=1.22, but you have numpy 1.24.4 which is incompatible. tensorflow 2.12.0 requires tensorboard<2.13,>=2.12, but you have tensorboard 2.13.0 which is incompatible. @@ -95,7 +95,7 @@ Imports .. parsed-literal:: - INFO:speechbrain.utils.quirks:Applied quirks (see `speechbrain.utils.quirks`): [allow_tf32, disable_jit_profiling] + INFO:speechbrain.utils.quirks:Applied quirks (see `speechbrain.utils.quirks`): [disable_jit_profiling, allow_tf32] INFO:speechbrain.utils.quirks:Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): [] @@ -135,8 +135,8 @@ SpeechBrain codebase. INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/emotion-recognition-wav2vec2-IEMOCAP' if not cached INFO:speechbrain.utils.fetching:Fetch custom_interface.py: Fetching from HuggingFace Hub 'speechbrain/emotion-recognition-wav2vec2-IEMOCAP' if not cached - 2024-11-22 05:15:27.494190: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 05:15:27.518517: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. + 2024-12-10 05:40:05.072169: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 05:40:05.097896: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. 
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. @@ -148,7 +148,7 @@ SpeechBrain codebase. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/configuration_utils.py:306: UserWarning: Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 Transformers. Using `model.gradient_checkpointing_enable()` instead, or if you are using the `Trainer` API, pass `gradient_checkpointing=True` in your `TrainingArguments`. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/configuration_utils.py:306: UserWarning: Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 Transformers. Using `model.gradient_checkpointing_enable()` instead, or if you are using the `Trainer` API, pass `gradient_checkpointing=True` in your `TrainingArguments`. warnings.warn( @@ -175,7 +175,7 @@ SpeechBrain codebase. INFO:speechbrain.utils.fetching:Fetch model.ckpt: Fetching from HuggingFace Hub 'speechbrain/emotion-recognition-wav2vec2-IEMOCAP' if not cached INFO:speechbrain.utils.fetching:Fetch label_encoder.txt: Fetching from HuggingFace Hub 'speechbrain/emotion-recognition-wav2vec2-IEMOCAP' if not cached INFO:speechbrain.utils.parameter_transfer:Loading pretrained files for: wav2vec2, model, label_encoder - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/speechbrain/utils/checkpoints.py:200: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. 
It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/speechbrain/utils/checkpoints.py:200: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. state_dict = torch.load(path, map_location=device) @@ -263,13 +263,19 @@ Step 2: Convert model to OpenVINO IR .. 
parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead warnings.warn( `loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`. - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py:872: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py:872: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim): + +.. 
parsed-literal:: + + model.safetensors: 0%| | 0.00/380M [00:00 1 or self.sliding_window is not None: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:164: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:164: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if past_key_values_length > 0: @@ -306,7 +306,7 @@ here, we always use fixed shapes in conversion by using an .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/unets/unet_stable_cascade.py:548: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/unets/unet_stable_cascade.py:548: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. 
We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if skip is not None and (x.size(-1) != skip.size(-1) or x.size(-2) != skip.size(-2)): diff --git a/docs/notebooks/stable-cascade-image-generation-with-output_files/stable-cascade-image-generation-with-output_29_2.jpg b/docs/notebooks/stable-cascade-image-generation-with-output_files/stable-cascade-image-generation-with-output_29_2.jpg index c26f6d2e4e6256..a09f1e5356f98d 100644 --- a/docs/notebooks/stable-cascade-image-generation-with-output_files/stable-cascade-image-generation-with-output_29_2.jpg +++ b/docs/notebooks/stable-cascade-image-generation-with-output_files/stable-cascade-image-generation-with-output_29_2.jpg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f1d0c9a1548ea1728d293d5e9864b85f3f438666fb647d5d98ce4a08bd9d494 -size 81141 +oid sha256:a7c11f26f9dc1eb3286c357bb147d12c812786f1796a8b62a5012075afe6de12 +size 83987 diff --git a/docs/notebooks/stable-cascade-image-generation-with-output_files/stable-cascade-image-generation-with-output_29_2.png b/docs/notebooks/stable-cascade-image-generation-with-output_files/stable-cascade-image-generation-with-output_29_2.png index 8d36ff65c9eca3..eece770ac13fce 100644 --- a/docs/notebooks/stable-cascade-image-generation-with-output_files/stable-cascade-image-generation-with-output_29_2.png +++ b/docs/notebooks/stable-cascade-image-generation-with-output_files/stable-cascade-image-generation-with-output_29_2.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a807ec75efd8572779d2c5de64bec882d23a29b52449e0a2df13fb67b527beae -size 1575960 +oid sha256:a32e4037dd5a34d227f3ef5a892121797617a3becd465227678a6ef6d7f8a090 +size 1608106 diff --git a/docs/notebooks/stable-cascade-image-generation-with-output_files/stable-cascade-image-generation-with-output_8_2.jpg 
b/docs/notebooks/stable-cascade-image-generation-with-output_files/stable-cascade-image-generation-with-output_8_2.jpg new file mode 100644 index 00000000000000..57b41a7f8d9bbe --- /dev/null +++ b/docs/notebooks/stable-cascade-image-generation-with-output_files/stable-cascade-image-generation-with-output_8_2.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb107211ea8c2d5b7f376c7896193df1b1b5c4b1ba4014e10734d5401848fada +size 92085 diff --git a/docs/notebooks/stable-cascade-image-generation-with-output_files/stable-cascade-image-generation-with-output_8_2.png b/docs/notebooks/stable-cascade-image-generation-with-output_files/stable-cascade-image-generation-with-output_8_2.png new file mode 100644 index 00000000000000..e718da40df51ae --- /dev/null +++ b/docs/notebooks/stable-cascade-image-generation-with-output_files/stable-cascade-image-generation-with-output_8_2.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87b04c8ee319ce2d23bd4cf76666af100b2852bb0dd4ba558d978698f871f581 +size 1591012 diff --git a/docs/notebooks/stable-diffusion-ip-adapter-with-output.rst b/docs/notebooks/stable-diffusion-ip-adapter-with-output.rst index 7f23c866161568..d5cbb62354f4fc 100644 --- a/docs/notebooks/stable-diffusion-ip-adapter-with-output.rst +++ b/docs/notebooks/stable-diffusion-ip-adapter-with-output.rst @@ -193,8 +193,8 @@ Additionally, LCM requires using LCMScheduler for efficient generation. .. parsed-literal:: - 2024-11-22 05:28:32.243878: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. - 2024-11-22 05:28:32.268737: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. 
+ 2024-12-10 05:53:08.894939: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. + 2024-12-10 05:53:08.920444: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. @@ -206,7 +206,7 @@ Additionally, LCM requires using LCMScheduler for efficient generation. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead. warnings.warn( @@ -288,10 +288,10 @@ extractor as input and returns image embeddings. .. parsed-literal:: [ WARNING ] Please fix your imports. Module %s has been moved to %s. The old module will be deleted in version %s. - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. 
Please use `model.hf_quantizer.is_trainable` instead + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_utils.py:5006: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead warnings.warn( `loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`. - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/clip/modeling_clip.py:243: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/models/clip/modeling_clip.py:243: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size): @@ -353,17 +353,17 @@ Model predicts the ``sample`` state for the next step. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/unets/unet_2d_condition.py:1111: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. 
We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/unets/unet_2d_condition.py:1111: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if dim % default_overall_up_factor != 0: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/embeddings.py:1801: FutureWarning: You have passed a tensor as `image_embeds`.This is deprecated and will be removed in a future release. Please make sure to update your script to pass `image_embeds` as a list of tensors to suppress this warning. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/embeddings.py:1801: FutureWarning: You have passed a tensor as `image_embeds`.This is deprecated and will be removed in a future release. Please make sure to update your script to pass `image_embeds` as a list of tensors to suppress this warning. deprecate("image_embeds not a list", "1.0.0", deprecation_message, standard_warn=False) - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/downsampling.py:136: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. 
This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/downsampling.py:136: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! assert hidden_states.shape[1] == self.channels - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/downsampling.py:145: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/downsampling.py:145: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! assert hidden_states.shape[1] == self.channels - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/upsampling.py:147: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! 
+ /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/upsampling.py:147: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! assert hidden_states.shape[1] == self.channels - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/upsampling.py:162: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/models/upsampling.py:162: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if hidden_states.shape[0] >= 64: @@ -441,16 +441,16 @@ image in pipeline, we can discuss it in inference examples. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/jit/_trace.py:1303: TracerWarning: Trace had nondeterministic nodes. Did you forget call .eval() on your model? 
Nodes: - %2506 : Float(1, 4, 64, 64, strides=[16384, 4096, 64, 1], requires_grad=0, device=cpu) = aten::randn(%2500, %2501, %2502, %2503, %2504, %2505) # /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/utils/torch_utils.py:81:0 + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/jit/_trace.py:1303: TracerWarning: Trace had nondeterministic nodes. Did you forget call .eval() on your model? Nodes: + %2506 : Float(1, 4, 64, 64, strides=[16384, 4096, 64, 1], requires_grad=0, device=cpu) = aten::randn(%2500, %2501, %2502, %2503, %2504, %2505) # /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/diffusers/utils/torch_utils.py:81:0 This may cause errors in trace checking. To disable trace checking, pass check_trace=False to torch.jit.trace() _check_trace( - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/jit/_trace.py:1303: TracerWarning: Output nr 1. of the traced function does not match the corresponding output of the Python function. Detailed error: + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/torch/jit/_trace.py:1303: TracerWarning: Output nr 1. of the traced function does not match the corresponding output of the Python function. Detailed error: Tensor-likes are not close! 
- Mismatched elements: 10391 / 16384 (63.4%) - Greatest absolute difference: 0.000982522964477539 at index (0, 1, 0, 60) (up to 1e-05 allowed) - Greatest relative difference: 0.014704568038430557 at index (0, 3, 63, 59) (up to 1e-05 allowed) + Mismatched elements: 10463 / 16384 (63.9%) + Greatest absolute difference: 0.001137852668762207 at index (0, 2, 0, 6) (up to 1e-05 allowed) + Greatest relative difference: 0.006470232386295268 at index (0, 3, 63, 59) (up to 1e-05 allowed) _check_trace( @@ -496,9 +496,9 @@ hidden states. .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:88: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:88: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if input_shape[-1] > 1 or self.sliding_window is not None: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:164: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. 
This means that the trace might not generalize to other inputs! + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/.venv/lib/python3.8/site-packages/transformers/modeling_attn_mask_utils.py:164: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs! if past_key_values_length > 0: diff --git a/docs/notebooks/stable-diffusion-ip-adapter-with-output_files/stable-diffusion-ip-adapter-with-output_22_1.png b/docs/notebooks/stable-diffusion-ip-adapter-with-output_files/stable-diffusion-ip-adapter-with-output_22_1.png index c5cde5597bba55..475b4dd8ea40b4 100644 --- a/docs/notebooks/stable-diffusion-ip-adapter-with-output_files/stable-diffusion-ip-adapter-with-output_22_1.png +++ b/docs/notebooks/stable-diffusion-ip-adapter-with-output_files/stable-diffusion-ip-adapter-with-output_22_1.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:541f65736f11c59692c577b1d85c0f6b9ab6ab51e8a9fdf6abf15063d06e2036 -size 965452 +oid sha256:f41e9dd669351422cfb30a6a5458431b699453f0934b43e199a0d4684dd4da85 +size 975310 diff --git a/docs/notebooks/stable-diffusion-ip-adapter-with-output_files/stable-diffusion-ip-adapter-with-output_25_0.png b/docs/notebooks/stable-diffusion-ip-adapter-with-output_files/stable-diffusion-ip-adapter-with-output_25_0.png index 61d61f6001a527..ba0e885cf44c5a 100644 --- a/docs/notebooks/stable-diffusion-ip-adapter-with-output_files/stable-diffusion-ip-adapter-with-output_25_0.png +++ b/docs/notebooks/stable-diffusion-ip-adapter-with-output_files/stable-diffusion-ip-adapter-with-output_25_0.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6162787bd52816b379097b9ec5284c2b65dc1178be5be7936240895f9de5285b -size 956477 +oid 
sha256:c5f0746a06f6d81be16e808107174009b68510b2e826885fe3f78021079b2a12 +size 945107 diff --git a/docs/notebooks/stable-diffusion-ip-adapter-with-output_files/stable-diffusion-ip-adapter-with-output_28_0.png b/docs/notebooks/stable-diffusion-ip-adapter-with-output_files/stable-diffusion-ip-adapter-with-output_28_0.png index 937356ce2c1a55..baae1d818321e1 100644 --- a/docs/notebooks/stable-diffusion-ip-adapter-with-output_files/stable-diffusion-ip-adapter-with-output_28_0.png +++ b/docs/notebooks/stable-diffusion-ip-adapter-with-output_files/stable-diffusion-ip-adapter-with-output_28_0.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d9fe298508ac791d7d0901af522504878a7fd98675bad80cfd4b60cafd2a49c0 -size 592390 +oid sha256:76b9fd3519e90a6fa4b39a5749633ffc0031a5141f3698920b724205b304e9f3 +size 595645 diff --git a/docs/notebooks/stable-diffusion-torchdynamo-backend-with-output.rst b/docs/notebooks/stable-diffusion-torchdynamo-backend-with-output.rst index e03a4ab614c769..a50a47392fb029 100644 --- a/docs/notebooks/stable-diffusion-torchdynamo-backend-with-output.rst +++ b/docs/notebooks/stable-diffusion-torchdynamo-backend-with-output.rst @@ -114,15 +114,18 @@ script. It speeds up PyTorch code by JIT-compiling it into optimized kernels. By default, Torch code runs in eager-mode, but with the use of torch.compile it goes through the following steps: -1. Graph acquisition - the model is rewritten as blocks of subgraphs that are either: +1. Graph acquisition + - the model is rewritten as blocks of subgraphs that are either: - compiled by TorchDynamo and “flattened”, - - falling back to the eager-mode, due to unsupported Python constructs (like control-flow + - falling back to the + eager-mode, due to unsupported Python constructs (like control-flow code). 2. Graph lowering - all PyTorch operations are decomposed into their constituent kernels specific to the chosen backend. -3. Graph compilation - the kernels call their corresponding low-level +3. 
Graph + compilation - the kernels call their corresponding low-level device-specific operations. Select device for inference and enable or disable saving the optimized diff --git a/docs/notebooks/stable-diffusion-v3-torch-fx-with-output.rst b/docs/notebooks/stable-diffusion-v3-torch-fx-with-output.rst new file mode 100644 index 00000000000000..2eee517599af7c --- /dev/null +++ b/docs/notebooks/stable-diffusion-v3-torch-fx-with-output.rst @@ -0,0 +1,562 @@ +Image generation with Torch.FX Stable Diffusion v3 and OpenVINO +=============================================================== + +Stable Diffusion V3 is next generation of latent diffusion image Stable +Diffusion models family that outperforms state-of-the-art text-to-image +generation systems in typography and prompt adherence, based on human +preference evaluations. In comparison with previous versions, it based +on Multimodal Diffusion Transformer (MMDiT) text-to-image model that +features greatly improved performance in image quality, typography, +complex prompt understanding, and resource-efficiency. + +.. figure:: https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/dd079427-89f2-4d28-a10e-c80792d750bf + :alt: mmdit.png + + mmdit.png + +More details about model can be found in `model +card `__, +`research +paper `__ +and `Stability.AI blog +post `__. In this +tutorial, we will demonstrate the optimize stable diffusion 3 in a Torch +FX representation using NNCF +`NNCF `__ for model +optimization. Additionally, we will accelerate the pipeline further by +running with torch.compile using the openvino backend. 
If you want to +run previous Stable Diffusion versions, please check our other +notebooks: + +- `Stable Diffusion `__ +- `Stable Diffusion v2 `__ +- `Stable Diffusion v3 `__ +- `Stable Diffusion XL `__ +- `LCM Stable + Diffusion `__ +- `Turbo SDXL `__ +- `Turbo SD `__ + +Installation Instructions +~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is a self-contained example that relies solely on its own code. + +We recommend running the notebook in a virtual environment. You only +need a Jupyter server to start. For details, please refer to +`Installation +Guide `__. + + +**Table of contents:** + + +- `Prerequisites <#prerequisites>`__ +- `Build PyTorch pipeline <#build-pytorch-pipeline>`__ + + - `Store the Configs <#store-the-configs>`__ + +- `Run FP Inference <#run-fp-inference>`__ +- `Convert models to Torch FX <#convert-models-to-torch-fx>`__ +- `Quantization <#quantization>`__ + + - `Collect Calibration Dataset <#collect-calibration-dataset>`__ + - `Compress and Quantize models <#compress-and-quantize-models>`__ + - `Create Optimized Pipeline <#create-optimized-pipeline>`__ + - `Check File Size <#check-file-size>`__ + - `Optimized pipeline inference <#optimized-pipeline-inference>`__ + - `Visualize Results <#visualize-results>`__ + +- `Interactive demo <#interactive-demo>`__ + +Prerequisites +------------- + + + +.. code:: ipython3 + + %pip install -q "gradio>=4.19" "torch>=2.5" "torchvision>=0.20" "numpy<2.0" "transformers" "datasets>=2.14.6" "opencv-python" "pillow" "peft>=0.7.0" "diffusers>=0.31.0" --extra-index-url https://download.pytorch.org/whl/cpu + %pip install -qU "openvino>=2024.3.0" + %pip install -q "nncf>=2.14.0" "typing_extensions>=4.11" + +.. 
code:: ipython3 + + from pathlib import Path + + import requests + + if not Path("sd3_torch_fx_helper.py").exists(): + r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/stable-diffusion-v3/sd3_torch_fx_helper.py") + open("sd3_torch_fx_helper.py", "w").write(r.text) + + if not Path("gradio_helper.py").exists(): + r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/stable-diffusion-v3/gradio_helper.py") + open("gradio_helper.py", "w").write(r.text) + + if not Path("notebook_utils.py").exists(): + r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py") + open("notebook_utils.py", "w").write(r.text) + +Build PyTorch pipeline +---------------------- + + + + **Note**: run model with notebook, you will need to accept license + agreement. You must be a registered user in Hugging Face Hub. + Please visit `HuggingFace model + card `__, + carefully read terms of usage and click accept button. You will need + to use an access token for the code below to run. For more + information on access tokens, refer to `this section of the + documentation `__. + You can login on Hugging Face Hub in notebook environment, using + following code: + +.. code:: ipython3 + + # uncomment these lines to login to huggingfacehub to get access to pretrained model + + # from huggingface_hub import notebook_login, whoami + + # try: + # whoami() + # print('Authorization token already provided') + # except OSError: + # notebook_login() + +.. code:: ipython3 + + from sd3_torch_fx_helper import get_sd3_pipeline, init_pipeline + + pipe = get_sd3_pipeline() + pipe.to("cpu") + +Store the Configs +~~~~~~~~~~~~~~~~~ + + + +This will be used later when wrapping the Torch FX models to insert back +into the pipeline + +.. 
code:: ipython3 + + configs_dict = {} + configs_dict["text_encoder"] = pipe.text_encoder.config + configs_dict["text_encoder_2"] = pipe.text_encoder_2.config + configs_dict["transformer"] = pipe.transformer.config + configs_dict["vae"] = pipe.vae.config + + pipe_config = pipe.config + +Run FP Inference +---------------- + + + +.. code:: ipython3 + + import numpy as np + import torch + + generator = torch.Generator(device="cpu").manual_seed(42) + prompt = "A raccoon trapped inside a glass jar full of colorful candies, the background is steamy with vivid colors" + num_inference_steps = 28 + with torch.no_grad(): + image = pipe( + prompt=prompt, + negative_prompt="", + num_inference_steps=num_inference_steps, + generator=generator, + guidance_scale=5, + ).images[0] + image.resize( + ( + 512, + 512, + ) + ) + +.. code:: ipython3 + + from notebook_utils import device_widget + + device = device_widget() + + device + +Convert models to Torch FX +-------------------------- + + + +This step converts the PyTorch models in the HF pipeline to Torch FX +representation using the ``torch.export.export_for_training()`` function. + +The pipeline consists of the following important parts: + +- Clip and T5 Text Encoders to create condition to generate an image + from a text prompt. +- Transformer for step-by-step denoising latent image representation. +- Autoencoder (VAE) for decoding latent space to image. + +.. 
code:: ipython3 + + import torch + from nncf.torch.dynamic_graph.patch_pytorch import disable_patching + + text_encoder_input = torch.ones((1, 77), dtype=torch.long) + text_encoder_kwargs = {} + text_encoder_kwargs["output_hidden_states"] = True + + vae_encoder_input = torch.ones((1, 3, 128, 128)) + vae_decoder_input = torch.ones((1, 16, 128, 128)) + + unet_kwargs = {} + unet_kwargs["hidden_states"] = torch.ones((2, 16, 128, 128)) + unet_kwargs["timestep"] = torch.from_numpy(np.array([1, 2], dtype=np.float32)) + unet_kwargs["encoder_hidden_states"] = torch.ones((2, 154, 4096)) + unet_kwargs["pooled_projections"] = torch.ones((2, 2048)) + + with torch.no_grad(): + with disable_patching(): + text_encoder = torch.export.export_for_training( + pipe.text_encoder.eval(), + args=(text_encoder_input,), + kwargs=(text_encoder_kwargs), + ).module() + text_encoder_2 = torch.export.export_for_training( + pipe.text_encoder_2.eval(), + args=(text_encoder_input,), + kwargs=(text_encoder_kwargs), + ).module() + pipe.vae.decoder = torch.export.export_for_training(pipe.vae.decoder.eval(), args=(vae_decoder_input,)).module() + pipe.vae.encoder = torch.export.export_for_training(pipe.vae.encoder.eval(), args=(vae_encoder_input,)).module() + vae = pipe.vae + transformer = torch.export.export_for_training(pipe.transformer.eval(), args=(), kwargs=(unet_kwargs)).module() + models_dict = {} + models_dict["transformer"] = transformer + models_dict["vae"] = vae + models_dict["text_encoder"] = text_encoder + models_dict["text_encoder_2"] = text_encoder_2 + del unet_kwargs + del vae_encoder_input + del vae_decoder_input + del text_encoder_input + del text_encoder_kwargs + del pipe + +Quantization +------------ + + + +`NNCF `__ enables +post-training quantization by adding quantization layers into model +graph and then using a subset of the training dataset to initialize the +parameters of these additional quantization layers. 
Quantized operations +are executed in ``INT8`` instead of ``FP32``/``FP16`` making model +inference faster. + +According to ``StableDiffusion3Pipeline`` structure, the ``transformer`` +model takes up significant portion of the overall pipeline execution +time. Now we will show you how to optimize the transformer part using +`NNCF `__ to reduce +computation cost and speed up the pipeline. Quantizing the rest of the +pipeline does not significantly improve inference performance but can +lead to a substantial degradation of accuracy. That’s why we use 8-bit +weight compression for the rest of the pipeline to reduce memory +footprint. + +Please select below whether you would like to run quantization to +improve model inference speed. + + **NOTE**: Quantization is time and memory consuming operation. + Running quantization code below may take some time. + +.. code:: ipython3 + + from notebook_utils import quantization_widget + + to_quantize = quantization_widget() + + to_quantize + +Let’s load ``skip magic`` extension to skip quantization if +``to_quantize`` is not selected + +.. code:: ipython3 + + # Fetch `skip_kernel_extension` module + import requests + + r = requests.get( + url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py", + ) + open("skip_kernel_extension.py", "w").write(r.text) + + %load_ext skip_kernel_extension + +Collect Calibration Dataset +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +.. 
code:: ipython3 + + %%skip not $to_quantize.value + + from typing import Any, Dict, List + + import datasets + from diffusers.models.transformers.transformer_sd3 import SD3Transformer2DModel + from tqdm.notebook import tqdm + + + def disable_progress_bar(pipeline, disable=True): + if not hasattr(pipeline, "_progress_bar_config"): + pipeline._progress_bar_config = {"disable": disable} + else: + pipeline._progress_bar_config["disable"] = disable + + + class UNetWrapper(SD3Transformer2DModel): + def __init__(self, transformer, config): + super().__init__(**config) + self.transformer = transformer + self.captured_args = [] + + def forward(self, *args, **kwargs): + del kwargs["joint_attention_kwargs"] + del kwargs["return_dict"] + self.captured_args.append((*args, *tuple(kwargs.values()))) + return self.transformer(*args, **kwargs) + + + def collect_calibration_data( + pipe, calibration_dataset_size: int, num_inference_steps: int + ) -> List[Dict]: + + original_unet = pipe.transformer + calibration_data = [] + disable_progress_bar(pipe) + + dataset = datasets.load_dataset( + "google-research-datasets/conceptual_captions", + split="train", + trust_remote_code=True, + ).shuffle(seed=42) + + transformer_config = dict(pipe.transformer.config) + del transformer_config["model"] + wrapped_unet = UNetWrapper(pipe.transformer.model, transformer_config) + pipe.transformer = wrapped_unet + # Run inference for data collection + pbar = tqdm(total=calibration_dataset_size) + for i, batch in enumerate(dataset): + prompt = batch["caption"] + if len(prompt) > pipe.tokenizer.model_max_length: + continue + # Run the pipeline + pipe(prompt, num_inference_steps=num_inference_steps) + calibration_data.extend(wrapped_unet.captured_args) + wrapped_unet.captured_args = [] + pbar.update(len(calibration_data) - pbar.n) + if pbar.n >= calibration_dataset_size: + break + + disable_progress_bar(pipe, disable=False) + pipe.transformer = original_unet + return calibration_data + + + if to_quantize: + 
pipe = init_pipeline(models_dict, configs_dict) + calibration_dataset_size = 300 + unet_calibration_data = collect_calibration_data( + pipe, calibration_dataset_size=calibration_dataset_size, num_inference_steps=28 + ) + del pipe + +Compress and Quantize models +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +.. code:: ipython3 + + %%skip not $to_quantize.value + + import nncf + from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters + from nncf.quantization.range_estimator import RangeEstimatorParametersSet + + text_encoder = models_dict["text_encoder"] + text_encoder_2 = models_dict["text_encoder_2"] + vae_encoder = models_dict["vae"].encoder + vae_decoder = models_dict["vae"].decoder + original_transformer = models_dict["transformer"] + if to_quantize: + with disable_patching(): + with torch.no_grad(): + nncf.compress_weights(text_encoder) + nncf.compress_weights(text_encoder_2) + nncf.compress_weights(vae_encoder) + nncf.compress_weights(vae_decoder) + quantized_transformer = nncf.quantize( + model=original_transformer, + calibration_dataset=nncf.Dataset(unet_calibration_data), + subset_size=len(unet_calibration_data), + model_type=nncf.ModelType.TRANSFORMER, + ignored_scope=nncf.IgnoredScope(names=["conv2d"]), + advanced_parameters=nncf.AdvancedQuantizationParameters( + weights_range_estimator_params=RangeEstimatorParametersSet.MINMAX, + activations_range_estimator_params=RangeEstimatorParametersSet.MINMAX, + ), + ) + + optimized_models_dict = {} + optimized_models_dict["transformer"] = quantized_transformer + optimized_models_dict["vae"] = vae + optimized_models_dict["text_encoder"] = text_encoder + optimized_models_dict["text_encoder_2"] = text_encoder_2 + del models_dict + +.. 
code:: ipython3 + + %%skip not $to_quantize.value + import openvino.torch + + optimized_models_dict["text_encoder"] = torch.compile( + optimized_models_dict["text_encoder"], backend="openvino" + ) + optimized_models_dict["text_encoder_2"] = torch.compile( + optimized_models_dict["text_encoder_2"], backend="openvino" + ) + optimized_models_dict["vae"].encoder = torch.compile( + optimized_models_dict["vae"].encoder, backend="openvino" + ) + optimized_models_dict["vae"].decoder = torch.compile( + optimized_models_dict["vae"].decoder, backend="openvino" + ) + optimized_models_dict["transformer"] = torch.compile( + optimized_models_dict["transformer"], backend="openvino" + ) + +Create Optimized Pipeline +~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +Initialize the optimized pipeline using the optimized models + +.. code:: ipython3 + + %%skip not $to_quantize.value + + opt_pipe = init_pipeline(optimized_models_dict, configs_dict) + +Check File Size +~~~~~~~~~~~~~~~ + + + +.. code:: ipython3 + + %%skip not $to_quantize.value + + + def get_model_size(models): + total_size = 0 + for model in models: + param_size = 0 + for param in model.parameters(): + param_size += param.nelement() * param.element_size() + buffer_size = 0 + for buffer in model.buffers(): + buffer_size += buffer.nelement() * buffer.element_size() + + model_size_mb = (param_size + buffer_size) / 1024**2 + + total_size += model_size_mb + return total_size + + + optimized_model_size = get_model_size([opt_pipe.transformer]) + original_model_size = get_model_size([original_transformer]) + + print(f"Original Transformer Size: {original_model_size} MB") + print(f"Optimized Transformer Size: {optimized_model_size} MB") + print(f"Compression Rate: {original_model_size / optimized_model_size:.3f}") + +Optimized pipeline inference +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +Run inference with single step to compile the model. + +.. 
code:: ipython3 + + %%skip not $to_quantize.value + + # Warmup the model for initial compile + with torch.no_grad(): + image = opt_pipe( + prompt=prompt, negative_prompt="", num_inference_steps=1, generator=generator + ).images[0] + +Visualize Results +~~~~~~~~~~~~~~~~~ + + + +.. code:: ipython3 + + %%skip not $to_quantize.value + + from sd3_torch_fx_helper import visualize_results + + generator = torch.Generator(device="cpu").manual_seed(42) + opt_image = opt_pipe( + prompt, + negative_prompt="", + num_inference_steps=28, + guidance_scale=5, + generator=generator, + ).images[0] + + visualize_results(image, opt_image) + +Interactive demo +---------------- + + + +Please select below whether you would like to use the quantized models +to launch the interactive demo. + +.. code:: ipython3 + + use_quantized_models = quantization_widget() + + use_quantized_models + +.. code:: ipython3 + + from gradio_helper import make_demo + + fx_pipe = init_pipeline(models_dict if not to_quantize.value else optimized_models_dict, configs_dict) + demo = make_demo(fx_pipe, False) + + # if you are launching remotely, specify server_name and server_port + # demo.launch(server_name='your server name', server_port='server port in int') + # if you have any issue to launch on your platform, you can pass share=True to launch method: + # demo.launch(share=True) + # it creates a publicly shareable link for the interface. Read more in the docs: https://gradio.app/docs/ + try: + demo.launch(debug=True) + except Exception: + demo.launch(debug=True, share=True) diff --git a/docs/notebooks/stable-diffusion-xl-with-output.rst b/docs/notebooks/stable-diffusion-xl-with-output.rst index 54a43191c229a4..7ec1c0c81eeb20 100644 --- a/docs/notebooks/stable-diffusion-xl-with-output.rst +++ b/docs/notebooks/stable-diffusion-xl-with-output.rst @@ -100,9 +100,9 @@ Install prerequisites .. 
code:: ipython3 - # %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu "torch>=2.1" "torchvision" "diffusers>=0.24.0" "invisible-watermark>=0.2.0" "transformers>=4.33.0" "accelerate" "onnx!=1.16.2" "peft>=0.6.2" - # %pip install -q "git+https://github.com/huggingface/optimum-intel.git" - # %pip install -q "openvino>=2023.1.0" "gradio>=4.19" "nncf>=2.9.0" + %pip install -q --extra-index-url https://download.pytorch.org/whl/cpu "torch>=2.1" "torchvision" "diffusers>=0.24.0" "invisible-watermark>=0.2.0" "transformers>=4.33.0" "accelerate" "onnx!=1.16.2" "peft>=0.6.2" + %pip install -q "git+https://github.com/huggingface/optimum-intel.git" + %pip install -q "openvino>=2023.1.0" "gradio>=4.19" "nncf>=2.9.0" SDXL Base model --------------- diff --git a/docs/notebooks/style-transfer-with-output.rst b/docs/notebooks/style-transfer-with-output.rst index b123ca215cbbfc..c228604aee32f9 100644 --- a/docs/notebooks/style-transfer-with-output.rst +++ b/docs/notebooks/style-transfer-with-output.rst @@ -96,7 +96,7 @@ Install requirements .. parsed-literal:: - 24717 + 24624 @@ -186,14 +186,14 @@ OpenVINO Intermediate Representation (IR) with ``FP16`` precision. .. parsed-literal:: - model/mosaic-9.onnx: 0%| | 0.00/6.42M [00:00`__. It uses +`BiT-M-R50x1/1 `__ +model, which is trained on ImageNet-21k. Big Transfer is a recipe for +pre-training image classification models on large supervised datasets +and efficiently fine-tuning them on any given target task. The recipe +achieves excellent performance on a wide variety of tasks, even when +using very few labeled examples from the target dataset. This tutorial +uses OpenVINO backend for performing model quantization in NNCF. 
+ + +**Table of contents:** + + +- `Prepare Dataset <#prepare-dataset>`__ +- `Plotting data samples <#plotting-data-samples>`__ +- `Model Fine-tuning <#model-fine-tuning>`__ +- `Perform model optimization (IR) + step <#perform-model-optimization-ir-step>`__ +- `Compute accuracy of the TF + model <#compute-accuracy-of-the-tf-model>`__ +- `Compute accuracy of the OpenVINO + model <#compute-accuracy-of-the-openvino-model>`__ +- `Quantize OpenVINO model using + NNCF <#quantize-openvino-model-using-nncf>`__ +- `Compute accuracy of the quantized + model <#compute-accuracy-of-the-quantized-model>`__ +- `Compare FP32 and INT8 accuracy <#compare-fp32-and-int8-accuracy>`__ +- `Compare inference results on one + picture <#compare-inference-results-on-one-picture>`__ + +Installation Instructions +~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is a self-contained example that relies solely on its own code. + +We recommend running the notebook in a virtual environment. You only +need a Jupyter server to start. For details, please refer to +`Installation +Guide `__. + +.. code:: ipython3 + + import platform + + %pip install -q "tensorflow-macos>=2.5; sys_platform == 'darwin' and platform_machine == 'arm64' and python_version > '3.8'" # macOS M1 and M2 + %pip install -q "tensorflow>=2.5; sys_platform == 'darwin' and platform_machine != 'arm64' and python_version > '3.8'" # macOS x86 + %pip install -q "tensorflow>=2.5; sys_platform != 'darwin' and python_version > '3.8'" + + %pip install -q "openvino>=2024.0.0" "nncf>=2.7.0" "tensorflow-hub>=0.15.0" tf_keras + %pip install -q "scikit-learn>=1.3.2" + + if platform.system() != "Windows": + %pip install -q "matplotlib>=3.4" "tensorflow_datasets>=4.9.0" + else: + %pip install -q "matplotlib>=3.4" "tensorflow_datasets>=4.9.0,<4.9.3" + +.. 
code:: ipython3 + + import os + import numpy as np + from pathlib import Path + + from openvino.runtime import Core + import openvino as ov + import nncf + import logging + + from nncf.common.logging.logger import set_log_level + + set_log_level(logging.ERROR) + + from sklearn.metrics import accuracy_score + + os.environ["TF_USE_LEGACY_KERAS"] = "1" + os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" + os.environ["TFHUB_CACHE_DIR"] = str(Path("./tfhub_modules").resolve()) + + import tensorflow as tf + import tensorflow_datasets as tfds + import tensorflow_hub as hub + + tfds.core.utils.gcs_utils._is_gcs_disabled = True + os.environ["NO_GCE_CHECK"] = "true" + + import requests + + r = requests.get( + url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", + ) + open("notebook_utils.py", "w").write(r.text) + +.. code:: ipython3 + + core = Core() + tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) + + + # For top 5 labels. + MAX_PREDS = 1 + TRAINING_BATCH_SIZE = 128 + BATCH_SIZE = 1 + IMG_SIZE = (256, 256) # Default Imagenet image size + NUM_CLASSES = 10 # For Imagenette dataset + FINE_TUNING_STEPS = 1 + LR = 1e-5 + + MEAN_RGB = (0.485 * 255, 0.456 * 255, 0.406 * 255) # From Imagenet dataset + STDDEV_RGB = (0.229 * 255, 0.224 * 255, 0.225 * 255) # From Imagenet dataset + +Prepare Dataset +~~~~~~~~~~~~~~~ + + + +.. code:: ipython3 + + datasets, datasets_info = tfds.load( + "imagenette/160px", + shuffle_files=True, + as_supervised=True, + with_info=True, + read_config=tfds.ReadConfig(shuffle_seed=0), + ) + train_ds, validation_ds = datasets["train"], datasets["validation"] + +.. 
code:: ipython3 + + def preprocessing(image, label): + image = tf.image.resize(image, IMG_SIZE) + image = tf.cast(image, tf.float32) / 255.0 + label = tf.one_hot(label, NUM_CLASSES) + return image, label + + + train_dataset = train_ds.map(preprocessing, num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(TRAINING_BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE) + validation_dataset = ( + validation_ds.map(preprocessing, num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(TRAINING_BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE) + ) + +.. code:: ipython3 + + # Class labels dictionary with imagenette sample names and classes + lbl_dict = dict( + n01440764="tench", + n02102040="English springer", + n02979186="cassette player", + n03000684="chain saw", + n03028079="church", + n03394916="French horn", + n03417042="garbage truck", + n03425413="gas pump", + n03445777="golf ball", + n03888257="parachute", + ) + + # Imagenette samples name index + class_idx_dict = [ + "n01440764", + "n02102040", + "n02979186", + "n03000684", + "n03028079", + "n03394916", + "n03417042", + "n03425413", + "n03445777", + "n03888257", + ] + + + def label_func(key): + return lbl_dict[key] + +Plotting data samples +~~~~~~~~~~~~~~~~~~~~~ + + + +.. code:: ipython3 + + import matplotlib.pyplot as plt + + # Get the class labels from the dataset info + class_labels = datasets_info.features["label"].names + + # Display labels along with the examples + num_examples_to_display = 4 + fig, axes = plt.subplots(nrows=1, ncols=num_examples_to_display, figsize=(10, 5)) + + for i, (image, label_index) in enumerate(train_ds.take(num_examples_to_display)): + label_name = class_labels[label_index.numpy()] + + axes[i].imshow(image.numpy()) + axes[i].set_title(f"{label_func(label_name)}") + axes[i].axis("off") + plt.tight_layout() + plt.show() + + +.. 
parsed-literal:: + + 2024-01-26 10:40:54.747316: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead. + + + +.. image:: tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_9_1.png + + +.. code:: ipython3 + + # Get the class labels from the dataset info + class_labels = datasets_info.features["label"].names + + # Display labels along with the examples + num_examples_to_display = 4 + fig, axes = plt.subplots(nrows=1, ncols=num_examples_to_display, figsize=(10, 5)) + + for i, (image, label_index) in enumerate(validation_ds.take(num_examples_to_display)): + label_name = class_labels[label_index.numpy()] + + axes[i].imshow(image.numpy()) + axes[i].set_title(f"{label_func(label_name)}") + axes[i].axis("off") + plt.tight_layout() + plt.show() + + +.. parsed-literal:: + + 2024-01-26 10:40:57.011386: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead. + + + +.. image:: tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_10_1.png + + +Model Fine-tuning +~~~~~~~~~~~~~~~~~ + + + +.. 
code:: ipython3 + + # Load the Big Transfer model + bit_model_url = "https://www.kaggle.com/models/google/bit/frameworks/TensorFlow2/variations/m-r50x1/versions/1" + bit_m = hub.KerasLayer(bit_model_url, trainable=True) + + tf_model_dir = Path("bit_tf_model") + + # Customize the model for the new task + model = tf.keras.Sequential([bit_m, tf.keras.layers.Dense(NUM_CLASSES, activation="softmax")]) + + # Compile the model + model.compile( + optimizer=tf.keras.optimizers.Adam(learning_rate=LR), + loss="categorical_crossentropy", + metrics=["accuracy"], + ) + + # Fine-tune the model + model.fit( + train_dataset.take(3000), + epochs=FINE_TUNING_STEPS, + validation_data=validation_dataset.take(1000), + ) + model.save(tf_model_dir, save_format="tf") + + +.. parsed-literal:: + + 101/101 [==============================] - 472s 4s/step - loss: 0.4904 - accuracy: 0.8806 - val_loss: 0.0810 - val_accuracy: 0.9840 + + +Perform model optimization (IR) step +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +.. code:: ipython3 + + ir_path = Path("bit_ov_model/bit_m_r50x1_1.xml") + if not ir_path.exists(): + print("Initiating model optimization..!!!") + ov_model = ov.convert_model("./bit_tf_model") + ov.save_model(ov_model, ir_path) + else: + print(f"IR model {ir_path} already exists.") + + +.. parsed-literal:: + + Initiating model optimization..!!! + + +Compute accuracy of the TF model +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +.. 
code:: ipython3 + + tf_model = tf.keras.models.load_model(tf_model_dir) + + tf_predictions = [] + gt_label = [] + + for _, label in validation_dataset: + for cls_label in label: + l_list = cls_label.numpy().tolist() + gt_label.append(l_list.index(1)) + + for img_batch, label_batch in validation_dataset: + tf_result_batch = tf_model.predict(img_batch, verbose=0) + for i in range(len(img_batch)): + tf_result = tf_result_batch[i] + tf_result = tf.reshape(tf_result, [-1]) + top5_label_idx = np.argsort(tf_result)[-MAX_PREDS::][::-1] + tf_predictions.append(top5_label_idx) + + # Convert the lists to NumPy arrays for accuracy calculation + tf_predictions = np.array(tf_predictions) + gt_label = np.array(gt_label) + + tf_acc_score = accuracy_score(tf_predictions, gt_label) + + +.. parsed-literal:: + + 2024-01-26 10:51:24.539777: W tensorflow/core/common_runtime/graph_constructor.cc:839] Node 're_lu_48/PartitionedCall' has 1 outputs but the _output_shapes attribute specifies shapes for 2 outputs. Output shapes may be inaccurate. + 2024-01-26 10:51:24.539856: W tensorflow/core/common_runtime/graph_constructor.cc:839] Node 'global_average_pooling2d/PartitionedCall' has 1 outputs but the _output_shapes attribute specifies shapes for 3 outputs. Output shapes may be inaccurate. + + +Compute accuracy of the OpenVINO model +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +Select device for inference: + +.. code:: ipython3 + + from notebook_utils import device_widget + + device = device_widget() + + device + +.. 
code:: ipython3 + + core = ov.Core() + + ov_fp32_model = core.read_model(ir_path) + ov_fp32_model.reshape([1, IMG_SIZE[0], IMG_SIZE[1], 3]) + + # Target device set to CPU (Other options Ex: AUTO/GPU/dGPU/) + compiled_model = ov.compile_model(ov_fp32_model, device.value) + output = compiled_model.outputs[0] + + ov_predictions = [] + for img_batch, _ in validation_dataset: + for image in img_batch: + image = tf.expand_dims(image, axis=0) + pred = compiled_model(image)[output] + ov_result = tf.reshape(pred, [-1]) + top_label_idx = np.argsort(ov_result)[-MAX_PREDS::][::-1] + ov_predictions.append(top_label_idx) + + fp32_acc_score = accuracy_score(ov_predictions, gt_label) + +Quantize OpenVINO model using NNCF +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +Model Quantization using NNCF + +1. Preprocessing and preparing validation samples for NNCF calibration +2. Perform NNCF Quantization on OpenVINO FP32 model +3. Serialize Quantized OpenVINO INT8 model + +.. code:: ipython3 + + def nncf_preprocessing(image, label): + image = tf.image.resize(image, IMG_SIZE) + image = image - MEAN_RGB + image = image / STDDEV_RGB + return image + + + int8_ir_path = Path("bit_ov_int8_model/bit_m_r50x1_1_ov_int8.xml") + val_ds = validation_ds.map(nncf_preprocessing, num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(1).prefetch(tf.data.experimental.AUTOTUNE) + + calibration_dataset = nncf.Dataset(val_ds) + + ov_fp32_model = core.read_model(ir_path) + + ov_int8_model = nncf.quantize(ov_fp32_model, calibration_dataset, fast_bias_correction=False) + + ov.save_model(ov_int8_model, int8_ir_path) + + + +.. parsed-literal:: + + Output() + + + + + + + + + + + + + + + + + + +.. parsed-literal:: + + Output() + + + + + + + + + + + + + + + + + +Compute accuracy of the quantized model +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +.. 
code:: ipython3 + + nncf_quantized_model = core.read_model(int8_ir_path) + nncf_quantized_model.reshape([1, IMG_SIZE[0], IMG_SIZE[1], 3]) + + # Target device set to CPU by default + compiled_model = ov.compile_model(nncf_quantized_model, device.value) + output = compiled_model.outputs[0] + + ov_predictions = [] + inp_tensor = nncf_quantized_model.inputs[0] + out_tensor = nncf_quantized_model.outputs[0] + + for img_batch, _ in validation_dataset: + for image in img_batch: + image = tf.expand_dims(image, axis=0) + pred = compiled_model(image)[output] + ov_result = tf.reshape(pred, [-1]) + top_label_idx = np.argsort(ov_result)[-MAX_PREDS::][::-1] + ov_predictions.append(top_label_idx) + + int8_acc_score = accuracy_score(ov_predictions, gt_label) + +Compare FP32 and INT8 accuracy +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +.. code:: ipython3 + + print(f"Accuracy of the tensorflow model (fp32): {tf_acc_score * 100: .2f}%") + print(f"Accuracy of the OpenVINO optimized model (fp32): {fp32_acc_score * 100: .2f}%") + print(f"Accuracy of the OpenVINO quantized model (int8): {int8_acc_score * 100: .2f}%") + accuracy_drop = fp32_acc_score - int8_acc_score + print(f"Accuracy drop between OV FP32 and INT8 model: {accuracy_drop * 100:.1f}% ") + + +.. parsed-literal:: + + Accuracy of the tensorflow model (fp32): 98.40% + Accuracy of the OpenVINO optimized model (fp32): 98.40% + Accuracy of the OpenVINO quantized model (int8): 98.00% + Accuracy drop between OV FP32 and INT8 model: 0.4% + + +Compare inference results on one picture +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + +.. 
code:: ipython3 + + # Accessing validation sample + sample_idx = 50 + vds = datasets["validation"] + + if len(vds) > sample_idx: + sample = vds.take(sample_idx + 1).skip(sample_idx).as_numpy_iterator().next() + else: + print("Dataset does not have enough samples...!!!") + + # Image data + sample_data = sample[0] + + # Label info + sample_label = sample[1] + + # Image data pre-processing + image = tf.image.resize(sample_data, IMG_SIZE) + image = tf.expand_dims(image, axis=0) + image = tf.cast(image, tf.float32) / 255.0 + + + # OpenVINO inference + def ov_inference(model: ov.Model, image) -> str: + compiled_model = ov.compile_model(model, device.value) + output = compiled_model.outputs[0] + pred = compiled_model(image)[output] + ov_result = tf.reshape(pred, [-1]) + pred_label = np.argsort(ov_result)[-MAX_PREDS::][::-1] + return pred_label + + + # OpenVINO FP32 model + ov_fp32_model = core.read_model(ir_path) + ov_fp32_model.reshape([1, IMG_SIZE[0], IMG_SIZE[1], 3]) + + # OpenVINO INT8 model + ov_int8_model = core.read_model(int8_ir_path) + ov_int8_model.reshape([1, IMG_SIZE[0], IMG_SIZE[1], 3]) + + # OpenVINO FP32 model inference + ov_fp32_pred_label = ov_inference(ov_fp32_model, image) + + print(f"Predicted label for the sample picture by float (fp32) model: {label_func(class_idx_dict[int(ov_fp32_pred_label)])}\n") + + # OpenVINO FP32 model inference + ov_int8_pred_label = ov_inference(ov_int8_model, image) + print(f"Predicted label for the sample picture by qunatized (int8) model: {label_func(class_idx_dict[int(ov_int8_pred_label)])}\n") + + # Plotting the image sample with ground truth + plt.figure() + plt.imshow(sample_data) + plt.title(f"Ground truth: {label_func(class_idx_dict[sample_label])}") + plt.axis("off") + plt.show() + + +.. parsed-literal:: + + Predicted label for the sample picture by float (fp32) model: gas pump + + Predicted label for the sample picture by qunatized (int8) model: gas pump + + + + +.. 
image:: tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_27_1.png + diff --git a/docs/notebooks/tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_10_1.png b/docs/notebooks/tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_10_1.png new file mode 100644 index 00000000000000..71aa7443a92cd8 --- /dev/null +++ b/docs/notebooks/tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_10_1.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7b53b19fd375df2b53791482fa4f76ec9d376be865f1298f4ea5aa0acdb1f35 +size 224517 diff --git a/docs/notebooks/tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_27_1.png b/docs/notebooks/tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_27_1.png new file mode 100644 index 00000000000000..38f050c05e472a --- /dev/null +++ b/docs/notebooks/tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_27_1.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:048e8ff7b7ac7fa5f9cb66251d618f1ae941f26255f62c725d6223abd63e6fb7 +size 335047 diff --git a/docs/notebooks/tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_9_1.png b/docs/notebooks/tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_9_1.png new file mode 100644 index 
00000000000000..a8d02fcbd58c16 --- /dev/null +++ b/docs/notebooks/tensorflow-bit-image-classification-nncf-quantization-with-output_files/tensorflow-bit-image-classification-nncf-quantization-with-output_9_1.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bf1b651f79891da47103dcc27259f890451c392325a712ff4c1b1cace7cb4be +size 296205 diff --git a/docs/notebooks/tensorflow-classification-to-openvino-with-output.rst b/docs/notebooks/tensorflow-classification-to-openvino-with-output.rst index 2e4f5ffe25369c..9ab3ae90d2fd3e 100644 --- a/docs/notebooks/tensorflow-classification-to-openvino-with-output.rst +++ b/docs/notebooks/tensorflow-classification-to-openvino-with-output.rst @@ -249,7 +249,7 @@ network. .. parsed-literal:: - data/coco.jpg: 0%| | 0.00/202k [00:00 + @@ -649,7 +649,7 @@ Zoo `__: .. parsed-literal:: - data/coco_91cl.txt: 0%| | 0.00/421 [00:00 + @@ -683,16 +678,10 @@ Zoo `__: -.. parsed-literal:: - - data/coco_91cl.txt: 0%| | 0.00/421 [00:00`__. -2. Run inference using the `Text to Image -pipeline `__ +2. Run inference using the `Text-to-Image Generation +pipeline `__ from OpenVINO GenAI. 
+ **Table of contents:** + - `Prerequisites <#prerequisites>`__ - `Convert model using Optimum-CLI tool <#convert-model-using-optimum-cli-tool>`__ @@ -57,19 +59,19 @@ Prerequisites import platform import requests - - + + %pip install -q "git+https://github.com/huggingface/optimum-intel.git" %pip install -q -U "openvino>=2024.5" "openvino-tokenizers>=2024.5" "openvino-genai>=2024.5" %pip install -q Pillow "diffusers>=0.30.3" "gradio>=4.19" "typing_extensions>=4.9" if platform.system() == "Darwin": %pip install -q "numpy<2.0.0" - + r = requests.get( url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", ) open("notebook_utils.py", "w").write(r.text) - + r = requests.get( url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/cmd_helper.py", ) @@ -81,7 +83,7 @@ Convert model using Optimum-CLI tool `Optimum Intel `__ -is the interface between the +is the interface between the `Transformers `__ and `Diffusers `__ libraries and OpenVINO to accelerate end-to-end pipelines on Intel architectures. @@ -116,12 +118,12 @@ wrapper over cli-command. .. code:: ipython3 from pathlib import Path - + from cmd_helper import optimum_cli - - + + model_dir = Path("dreamlike_anime_1_0_ov") - + if not model_dir.exists(): optimum_cli("dreamlike-art/dreamlike-anime-1.0", model_dir) @@ -135,8 +137,8 @@ select device from dropdown list for running inference using OpenVINO .. 
code:: ipython3 from notebook_utils import device_widget - - + + device = device_widget("CPU", exclude=["NPU"]) device @@ -161,27 +163,27 @@ That’s it:) import openvino as ov from PIL import Image import torch - - + + class Generator(ov_genai.Generator): def __init__(self, seed): ov_genai.Generator.__init__(self) self.generator = torch.Generator(device="cpu").manual_seed(seed) - + def next(self): return torch.randn(1, generator=self.generator, dtype=torch.float32).item() - + def randn_tensor(self, shape: ov.Shape): torch_tensor = torch.randn(list(shape), generator=self.generator, dtype=torch.float32) return ov.Tensor(torch_tensor.numpy()) - - + + random_generator = Generator(42) # openvino_genai.CppStdGenerator can be used to have same images as C++ sample pipe = ov_genai.Text2ImagePipeline(model_dir, device.value) prompt = "anime, masterpiece, high quality, a green snowman with a happy smiling face in the snows" - + image_tensor = pipe.generate(prompt, width=512, height=512, num_inference_steps=20, num_images_per_prompt=1, generator=random_generator) - + image = Image.fromarray(image_tensor.data[0]) .. code:: ipython3 @@ -228,20 +230,20 @@ from command line: def prepare_adapter_config(adapters): adapter_config = ov_genai.AdapterConfig() - + # Multiple LoRA adapters applied simultaneously are supported, parse them all and corresponding alphas from cmd parameters: for i in range(int(len(adapters) / 2)): adapter = ov_genai.Adapter(adapters[2 * i]) alpha = float(adapters[2 * i + 1]) adapter_config.add(adapter, alpha) - + return adapter_config - - + + adapter_config = prepare_adapter_config(["soulcard.safetensors", 0.5]) - + pipe = ov_genai.Text2ImagePipeline(model_dir, device.value, adapters=adapter_config) - + image_tensor = pipe.generate(prompt, generator=Generator(42), width=512, height=512, num_inference_steps=20) image = Image.fromarray(image_tensor.data[0]) @@ -268,10 +270,10 @@ Interactive demo .. 
code:: ipython3 from gradio_helper import make_demo - - + + demo = make_demo(pipe, Generator, adapter_config) - + try: demo.launch(debug=True) except Exception: diff --git a/docs/notebooks/tflite-selfie-segmentation-with-output.rst b/docs/notebooks/tflite-selfie-segmentation-with-output.rst index 7f613016c47019..8691da62b77526 100644 --- a/docs/notebooks/tflite-selfie-segmentation-with-output.rst +++ b/docs/notebooks/tflite-selfie-segmentation-with-output.rst @@ -117,8 +117,7 @@ Download pretrained model and test image tflite_model_path = Path("selfie_multiclass_256x256.tflite") tflite_model_url = "https://storage.googleapis.com/mediapipe-models/image_segmenter/selfie_multiclass_256x256/float32/latest/selfie_multiclass_256x256.tflite" - if not tflite_model_path.exists(): - download_file(tflite_model_url, tflite_model_path) + download_file(tflite_model_url, tflite_model_path) @@ -127,6 +126,14 @@ Download pretrained model and test image selfie_multiclass_256x256.tflite: 0%| | 0.00/15.6M [00:00`__. .. code:: ipython3 - image = load_image("https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/coco_bricks.png") + image = load_image("coco_bricks.png", "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/coco_bricks.png") # load_image reads the image in BGR format, [:,:,::-1] reshape transfroms it to RGB image = Image.fromarray(image[:, :, ::-1]) resized_image = image.resize((224, 224)) @@ -274,7 +274,7 @@ GPU. [ WARNING ] Performance hint was not explicitly specified in command line. Device(AUTO) performance hint will be set to PerformanceMode.THROUGHPUT. [Step 4/11] Reading model files [ INFO ] Loading model files - [ INFO ] Read model took 9.66 ms + [ INFO ] Read model took 9.35 ms [ INFO ] Original model I/O parameters: [ INFO ] Model inputs: [ INFO ] images (node: images) : f32 / [...] / [1,224,224,3] @@ -288,7 +288,7 @@ GPU. [ INFO ] Model outputs: [ INFO ] Softmax (node: 61) : f32 / [...] 
/ [1,1000] [Step 7/11] Loading the model to the device - [ INFO ] Compile model took 147.18 ms + [ INFO ] Compile model took 166.78 ms [Step 8/11] Querying optimal runtime parameters [ INFO ] Model: [ INFO ] NETWORK_NAME: TensorFlow_Lite_Frontend_IR @@ -325,15 +325,15 @@ GPU. [ INFO ] Fill input 'images' with random values [Step 10/11] Measuring performance (Start inference asynchronously, 6 inference requests, limits: 15000 ms duration) [ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop). - [ INFO ] First inference took 7.26 ms + [ INFO ] First inference took 7.31 ms [Step 11/11] Dumping statistics report [ INFO ] Execution Devices:['CPU'] - [ INFO ] Count: 16578 iterations - [ INFO ] Duration: 15004.17 ms + [ INFO ] Count: 17460 iterations + [ INFO ] Duration: 15005.01 ms [ INFO ] Latency: - [ INFO ] Median: 5.30 ms - [ INFO ] Average: 5.29 ms - [ INFO ] Min: 2.92 ms - [ INFO ] Max: 17.62 ms - [ INFO ] Throughput: 1104.89 FPS + [ INFO ] Median: 4.99 ms + [ INFO ] Average: 5.02 ms + [ INFO ] Min: 2.99 ms + [ INFO ] Max: 17.05 ms + [ INFO ] Throughput: 1163.61 FPS diff --git a/docs/notebooks/tiny-sd-image-generation-with-output.rst b/docs/notebooks/tiny-sd-image-generation-with-output.rst index 2c4126b8aefc4c..090b8ff5f63378 100644 --- a/docs/notebooks/tiny-sd-image-generation-with-output.rst +++ b/docs/notebooks/tiny-sd-image-generation-with-output.rst @@ -96,9 +96,9 @@ First, load the pre-trained weights of all components of the model. import gc from diffusers import StableDiffusionPipeline - + model_id = "segmind/tiny-sd" - + pipe = StableDiffusionPipeline.from_pretrained(model_id).to("cpu") text_encoder = pipe.text_encoder text_encoder.eval() @@ -106,7 +106,7 @@ First, load the pre-trained weights of all components of the model. unet.eval() vae = pipe.vae vae.eval() - + del pipe gc.collect() @@ -164,10 +164,10 @@ hidden states. 
from pathlib import Path import torch import openvino as ov - + TEXT_ENCODER_OV_PATH = Path("text_encoder.xml") - - + + def convert_encoder(text_encoder: torch.nn.Module, ir_path: Path): """ Convert Text Encoder mode. @@ -181,7 +181,7 @@ hidden states. input_ids = torch.ones((1, 77), dtype=torch.long) # switch model to inference mode text_encoder.eval() - + # disable gradients calculation for reducing memory consumption with torch.no_grad(): # Export model to IR format @@ -195,13 +195,13 @@ hidden states. ov.save_model(ov_model, ir_path) del ov_model print(f"Text Encoder successfully converted to IR and saved to {ir_path}") - - + + if not TEXT_ENCODER_OV_PATH.exists(): convert_encoder(text_encoder, TEXT_ENCODER_OV_PATH) else: print(f"Text encoder will be loaded from {TEXT_ENCODER_OV_PATH}") - + del text_encoder gc.collect(); @@ -223,12 +223,12 @@ Model predicts the ``sample`` state for the next step. import numpy as np from openvino import PartialShape, Type - + UNET_OV_PATH = Path("unet.xml") - + dtype_mapping = {torch.float32: Type.f32, torch.float64: Type.f64} - - + + def convert_unet(unet: torch.nn.Module, ir_path: Path): """ Convert U-net model to IR format. @@ -250,15 +250,15 @@ Model predicts the ``sample`` state for the next step. shape = PartialShape(tuple(input_tensor.shape)) element_type = dtype_mapping[input_tensor.dtype] input_info.append((shape, element_type)) - + unet.eval() with torch.no_grad(): ov_model = ov.convert_model(unet, example_input=dummy_inputs, input=input_info) ov.save_model(ov_model, ir_path) del ov_model print(f"Unet successfully converted to IR and saved to {ir_path}") - - + + if not UNET_OV_PATH.exists(): convert_unet(unet, UNET_OV_PATH) gc.collect() @@ -292,8 +292,8 @@ of the pipeline, it will be better to convert them to separate models. .. code:: ipython3 VAE_ENCODER_OV_PATH = Path("vae_encodr.xml") - - + + def convert_vae_encoder(vae: torch.nn.Module, ir_path: Path): """ Convert VAE model for encoding to IR format. 
@@ -305,15 +305,15 @@ of the pipeline, it will be better to convert them to separate models. Returns: None """ - + class VAEEncoderWrapper(torch.nn.Module): def __init__(self, vae): super().__init__() self.vae = vae - + def forward(self, image): return self.vae.encode(x=image)["latent_dist"].sample() - + vae_encoder = VAEEncoderWrapper(vae) vae_encoder.eval() image = torch.zeros((1, 3, 512, 512)) @@ -322,16 +322,16 @@ of the pipeline, it will be better to convert them to separate models. ov.save_model(ov_model, ir_path) del ov_model print(f"VAE encoder successfully converted to IR and saved to {ir_path}") - - + + if not VAE_ENCODER_OV_PATH.exists(): convert_vae_encoder(vae, VAE_ENCODER_OV_PATH) else: print(f"VAE encoder will be loaded from {VAE_ENCODER_OV_PATH}") - + VAE_DECODER_OV_PATH = Path("vae_decoder.xml") - - + + def convert_vae_decoder(vae: torch.nn.Module, ir_path: Path): """ Convert VAE model for decoding to IR format. @@ -343,31 +343,31 @@ of the pipeline, it will be better to convert them to separate models. Returns: None """ - + class VAEDecoderWrapper(torch.nn.Module): def __init__(self, vae): super().__init__() self.vae = vae - + def forward(self, latents): return self.vae.decode(latents) - + vae_decoder = VAEDecoderWrapper(vae) latents = torch.zeros((1, 4, 64, 64)) - + vae_decoder.eval() with torch.no_grad(): ov_model = ov.convert_model(vae_decoder, example_input=latents, input=[((1, 4, 64, 64),)]) ov.save_model(ov_model, ir_path) del ov_model print(f"VAE decoder successfully converted to IR and saved to {ir_path}") - - + + if not VAE_DECODER_OV_PATH.exists(): convert_vae_decoder(vae, VAE_DECODER_OV_PATH) else: print(f"VAE decoder will be loaded from {VAE_DECODER_OV_PATH}") - + del vae gc.collect(); @@ -426,20 +426,20 @@ of the variational auto encoder. 
import inspect from typing import List, Optional, Union, Dict - + import PIL import cv2 - + from transformers import CLIPTokenizer from diffusers.pipelines.pipeline_utils import DiffusionPipeline from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler - - + + def scale_fit_to_window(dst_width: int, dst_height: int, image_width: int, image_height: int): """ Preprocessing helper function for calculating image size for resize with peserving original aspect ratio and fitting image to specific window size - + Parameters: dst_width (int): destination window width dst_height (int): destination window height @@ -451,15 +451,15 @@ of the variational auto encoder. """ im_scale = min(dst_height / image_height, dst_width / image_width) return int(im_scale * image_width), int(im_scale * image_height) - - + + def preprocess(image: PIL.Image.Image): """ Image preprocessing function. Takes image in PIL.Image format, resizes it to keep aspect ration and fits to model input window 512x512, then converts it to np.ndarray and adds padding with zeros on right or bottom side of image (depends from aspect ratio), after that converts data to float32 data type and change range of values from [0, 255] to [-1, 1], finally, converts data layout from planar NHWC to NCHW. The function returns preprocessed input tensor and padding size, which can be used in postprocessing. - + Parameters: image (PIL.Image.Image): input image Returns: @@ -477,8 +477,8 @@ of the variational auto encoder. image = 2.0 * image - 1.0 image = image.transpose(0, 3, 1, 2) return image, {"padding": pad, "src_width": src_width, "src_height": src_height} - - + + class OVStableDiffusionPipeline(DiffusionPipeline): def __init__( self, @@ -518,7 +518,7 @@ of the variational auto encoder. self.height = 512 self.width = 512 self.tokenizer = tokenizer - + def __call__( self, prompt: Union[str, List[str]], @@ -567,7 +567,7 @@ of the variational auto encoder. 
""" if seed is not None: np.random.seed(seed) - + img_buffer = [] do_classifier_free_guidance = guidance_scale > 1.0 # get prompt text embeddings @@ -576,20 +576,20 @@ of the variational auto encoder. do_classifier_free_guidance=do_classifier_free_guidance, negative_prompt=negative_prompt, ) - + # set timesteps accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys()) extra_set_kwargs = {} if accepts_offset: extra_set_kwargs["offset"] = 1 - + self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs) timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) latent_timestep = timesteps[:1] - + # get the initial random noise unless the user supplied it latents, meta = self.prepare_latents(image, latent_timestep) - + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 @@ -598,19 +598,19 @@ of the variational auto encoder. extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta - + for i, t in enumerate(self.progress_bar(timesteps)): # expand the latents if you are doing classifier free guidance latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - + # predict the noise residual noise_pred = self.unet([latent_model_input, t, text_embeddings])[self._unet_output] # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred[0], noise_pred[1] noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - + # compute the previous noisy sample x_t -> x_t-1 latents = self.scheduler.step( torch.from_numpy(noise_pred), @@ -622,13 +622,13 @@ of the variational auto encoder. 
image = self.vae_decoder(latents * (1 / 0.18215))[self._vae_d_output] image = self.postprocess_image(image, meta, output_type) img_buffer.extend(image) - + # scale and decode the image latents with vae image = self.vae_decoder(latents * (1 / 0.18215))[self._vae_d_output] - + image = self.postprocess_image(image, meta, output_type) return {"sample": image, "iterations": img_buffer} - + def _encode_prompt( self, prompt: Union[str, List[str]], @@ -638,7 +638,7 @@ of the variational auto encoder. ): """ Encodes the prompt into text encoder hidden states. - + Parameters: prompt (str or list(str)): prompt to be encoded num_images_per_prompt (int): number of images that should be generated per prompt @@ -648,7 +648,7 @@ of the variational auto encoder. text_embeddings (np.ndarray): text encoder hidden states """ batch_size = len(prompt) if isinstance(prompt, list) else 1 - + # tokenize input prompts text_inputs = self.tokenizer( prompt, @@ -658,15 +658,15 @@ of the variational auto encoder. return_tensors="np", ) text_input_ids = text_inputs.input_ids - + text_embeddings = self.text_encoder(text_input_ids)[self._text_encoder_output] - + # duplicate text embeddings for each generation per prompt if num_images_per_prompt != 1: bs_embed, seq_len, _ = text_embeddings.shape text_embeddings = np.tile(text_embeddings, (1, num_images_per_prompt, 1)) text_embeddings = np.reshape(text_embeddings, (bs_embed * num_images_per_prompt, seq_len, -1)) - + # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance: uncond_tokens: List[str] @@ -684,25 +684,25 @@ of the variational auto encoder. 
truncation=True, return_tensors="np", ) - + uncond_embeddings = self.text_encoder(uncond_input.input_ids)[self._text_encoder_output] - + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = uncond_embeddings.shape[1] uncond_embeddings = np.tile(uncond_embeddings, (1, num_images_per_prompt, 1)) uncond_embeddings = np.reshape(uncond_embeddings, (batch_size * num_images_per_prompt, seq_len, -1)) - + # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes text_embeddings = np.concatenate([uncond_embeddings, text_embeddings]) - + return text_embeddings - + def prepare_latents(self, image: PIL.Image.Image = None, latent_timestep: torch.Tensor = None): """ Function for getting initial latents for starting generation - + Parameters: image (PIL.Image.Image, *optional*, None): Input image for generation, if not provided randon noise will be used as starting point @@ -723,12 +723,12 @@ of the variational auto encoder. latents = self.vae_encoder(input_image)[self._vae_e_output] * 0.18215 latents = self.scheduler.add_noise(torch.from_numpy(latents), torch.from_numpy(noise), latent_timestep).numpy() return latents, meta - + def postprocess_image(self, image: np.ndarray, meta: Dict, output_type: str = "pil"): """ Postprocessing for decoded image. Takes generated image decoded by VAE decoder, unpad it to initila image size (if required), normalize and convert to [0, 255] pixels range. Optionally, convertes it from np.ndarray to PIL.Image format - + Parameters: image (np.ndarray): Generated image @@ -760,12 +760,12 @@ of the variational auto encoder. 
orig_height, orig_width = meta["src_height"], meta["src_width"] image = [cv2.resize(img, (orig_width, orig_width)) for img in image] return image - + def get_timesteps(self, num_inference_steps: int, strength: float): """ Helper function for getting scheduler timesteps for generation In case of image-to-image generation, it updates number of steps according to strength - + Parameters: num_inference_steps (int): number of inference steps for generation @@ -775,10 +775,10 @@ of the variational auto encoder. """ # get the original timestep using init_timestep init_timestep = min(int(num_inference_steps * strength), num_inference_steps) - + t_start = max(num_inference_steps - init_timestep, 0) timesteps = self.scheduler.timesteps[t_start:] - + return timesteps, num_inference_steps - t_start Configure Inference Pipeline @@ -797,16 +797,16 @@ Select device from dropdown list for running inference using OpenVINO. .. code:: ipython3 import requests - + r = requests.get( url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", ) open("notebook_utils.py", "w").write(r.text) - + from notebook_utils import device_widget - + device = device_widget() - + device @@ -821,7 +821,7 @@ Select device from dropdown list for running inference using OpenVINO. .. code:: ipython3 core = ov.Core() - + text_enc = core.compile_model(TEXT_ENCODER_OV_PATH, device.value) Calibrate UNet for GPU inference @@ -839,28 +839,28 @@ operations to be executed in full precision. 
import pickle import requests import os - + # Fetch `model_upcast_utils` which helps to restore accuracy when inferred on GPU r = requests.get("https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/model_upcast_utils.py") with open("model_upcast_utils.py", "w") as f: f.write(r.text) - + # Fetch an example input for UNet model needed for upcasting calibration process r = requests.get("https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/pkl/unet_calibration_example_input.pkl") with open("unet_calibration_example_input.pkl", "wb") as f: f.write(r.content) - + from model_upcast_utils import ( is_model_partially_upcasted, partially_upcast_nodes_to_fp32, ) - + unet_model = core.read_model(UNET_OV_PATH) if "GPU" in core.available_devices and not is_model_partially_upcasted(unet_model): with open("unet_calibration_example_input.pkl", "rb") as f: example_input = pickle.load(f) unet_model = partially_upcast_nodes_to_fp32(unet_model, example_input, upcast_ratio=0.7, operation_types=["Convolution"]) - + ov.save_model(unet_model, UNET_OV_PATH.with_suffix("._tmp.xml")) del unet_model os.remove(UNET_OV_PATH) @@ -875,7 +875,7 @@ operations to be executed in full precision. .. code:: ipython3 ov_config = {"INFERENCE_PRECISION_HINT": "f32"} if device.value != "CPU" else {} - + vae_decoder = core.compile_model(VAE_DECODER_OV_PATH, device.value, ov_config) vae_encoder = core.compile_model(VAE_ENCODER_OV_PATH, device.value, ov_config) @@ -886,10 +886,10 @@ Let us define them and put all components together from transformers import CLIPTokenizer from diffusers.schedulers import LMSDiscreteScheduler - + lms = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear") tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") - + ov_pipe = OVStableDiffusionPipeline( tokenizer=tokenizer, text_encoder=text_enc, @@ -923,7 +923,7 @@ Now, let’s see model in action .. 
parsed-literal:: Pipeline settings - Input text: RAW studio photo of An intricate forest minitown landscape trapped in a bottle, atmospheric oliva lighting, on the table, intricate details, dark shot, soothing tones, muted colors + Input text: RAW studio photo of An intricate forest minitown landscape trapped in a bottle, atmospheric oliva lighting, on the table, intricate details, dark shot, soothing tones, muted colors Seed: 431 Number of steps: 20 @@ -961,7 +961,7 @@ Now is show time! .. parsed-literal:: Input text: - RAW studio photo of An intricate forest minitown landscape trapped in a bottle, atmospheric oliva lighting, on the table, intricate details, dark shot, soothing tones, muted colors + RAW studio photo of An intricate forest minitown landscape trapped in a bottle, atmospheric oliva lighting, on the table, intricate details, dark shot, soothing tones, muted colors @@ -1014,10 +1014,11 @@ found in this .. code:: ipython3 from diffusers.utils import load_image - + default_image_url = "https://user-images.githubusercontent.com/29454499/260418860-69cc443a-9ee6-493c-a393-3a97af080be7.jpg" + image_name = "default.jpg" # read uploaded image - image = load_image(default_image_url) + image = load_image(image_name, default_image_url) print("Pipeline settings") print(f"Input positive prompt: \n\t{text_prompt_i2i}") print(f"Input negative prompt: \n\t{negative_prompt_i2i}") @@ -1039,9 +1040,9 @@ found in this .. parsed-literal:: Pipeline settings - Input positive prompt: + Input positive prompt: professional photo portrait of woman, highly detailed, hyper realistic, cinematic effects, soft lighting - Input negative prompt: + Input negative prompt: blurry, poor quality, low res, worst quality, cropped, ugly, poorly drawn face, without eyes, mutation, unreal, animate, poorly drawn eyes Seed: 82698152 Number of steps: 40 @@ -1090,13 +1091,13 @@ Interactive Demo .. 
code:: ipython3 import gradio as gr - - + + def generate_from_text(text, negative_text, seed, num_steps, _=gr.Progress(track_tqdm=True)): result = ov_pipe(text, negative_prompt=negative_text, num_inference_steps=num_steps, seed=seed) return result["sample"][0] - - + + def generate_from_image(img, text, negative_text, seed, num_steps, strength, _=gr.Progress(track_tqdm=True)): result = ov_pipe( text, @@ -1113,11 +1114,11 @@ Interactive Demo if not Path("gradio_helper.py").exists(): r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/tiny-sd-image-generation/gradio_helper.py") open("gradio_helper.py", "w").write(r.text) - + from gradio_helper import make_demo - + demo = make_demo(text_to_text_fn=generate_from_text, image_to_image_fn=generate_from_image) - + try: demo.queue().launch(debug=False) except Exception: diff --git a/docs/notebooks/vehicle-detection-and-recognition-with-output.rst b/docs/notebooks/vehicle-detection-and-recognition-with-output.rst index 30204d6bd11ded..fab72fb0725d03 100644 --- a/docs/notebooks/vehicle-detection-and-recognition-with-output.rst +++ b/docs/notebooks/vehicle-detection-and-recognition-with-output.rst @@ -152,25 +152,25 @@ model is already downloaded, this step is skipped. .. parsed-literal:: - model/vehicle-detection-0200.xml: 0%| | 0.00/181k [00:00 - 100%|██████████| 4.68M/4.68M [00:00<00:00, 34.1MB/s] + 100%|██████████| 4.68M/4.68M [00:00<00:00, 17.2MB/s] .. parsed-literal:: @@ -215,13 +215,13 @@ next cell loads the model and the pre-trained weights. .. parsed-literal:: - Loading model weights from: 'model/u2net_lite/u2net_lite.pth' + /tmp/ipykernel_2254056/1036642300.py:7: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. 
It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + net.load_state_dict(state_dict=torch.load(model_path, map_location="cpu")) .. parsed-literal:: - /tmp/ipykernel_3590581/1036642300.py:7: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. - net.load_state_dict(state_dict=torch.load(model_path, map_location="cpu")) + Loading model weights from: 'model/u2net_lite/u2net_lite.pth' @@ -247,7 +247,7 @@ OpenVINO IR format. Executing the following command may take a while. .. 
parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/vision-background-removal/model/u2net.py:23: UserWarning: `nn.functional.upsample` is deprecated. Use `nn.functional.interpolate` instead. + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/vision-background-removal/model/u2net.py:23: UserWarning: `nn.functional.upsample` is deprecated. Use `nn.functional.interpolate` instead. src = F.upsample(src,size=tar.shape[2:],mode='bilinear') @@ -273,12 +273,13 @@ repository `__ and multiplied by .. code:: ipython3 IMAGE_URI = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/coco_hollywood.jpg" + IMAGE_NAME = "coco_hollywood.jpg" input_mean = np.array([123.675, 116.28, 103.53]).reshape(1, 3, 1, 1) input_scale = np.array([58.395, 57.12, 57.375]).reshape(1, 3, 1, 1) image = cv2.cvtColor( - src=load_image(IMAGE_URI), + src=load_image(IMAGE_NAME, IMAGE_URI), code=cv2.COLOR_BGR2RGB, ) @@ -336,7 +337,7 @@ Load the OpenVINO IR model to OpenVINO Runtime and do inference. .. parsed-literal:: - Inference finished. Inference time: 0.107 seconds, FPS: 9.35. + Inference finished. Inference time: 0.109 seconds, FPS: 9.19. Visualize Results @@ -389,12 +390,13 @@ background pixels a value of 0. Replace the background image as follows: .. 
code:: ipython3 - BACKGROUND_FILE = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/wall.jpg" + BACKGROUND_IMAGE_URL = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/wall.jpg" + BACKGROUND_IMAGE_NAME = "wall.jpg" OUTPUT_DIR = "output" os.makedirs(name=OUTPUT_DIR, exist_ok=True) - background_image = cv2.cvtColor(src=load_image(BACKGROUND_FILE), code=cv2.COLOR_BGR2RGB) + background_image = cv2.cvtColor(src=load_image(BACKGROUND_IMAGE_NAME, BACKGROUND_IMAGE_URL), code=cv2.COLOR_BGR2RGB) background_image = cv2.resize(src=background_image, dsize=(image.shape[1], image.shape[0])) # Set all the foreground pixels from the result to 0 @@ -403,7 +405,7 @@ background pixels a value of 0. Replace the background image as follows: new_image = background_image + bg_removed_result # Save the generated image. - new_image_path = Path(f"{OUTPUT_DIR}/{Path(IMAGE_URI).stem}-{Path(BACKGROUND_FILE).stem}.jpg") + new_image_path = Path(f"{OUTPUT_DIR}/{Path(IMAGE_URI).stem}-{BACKGROUND_IMAGE_NAME}") cv2.imwrite(filename=str(new_image_path), img=cv2.cvtColor(new_image, cv2.COLOR_RGB2BGR)) # Display the original image and the image with the new background side by side diff --git a/docs/notebooks/vision-monodepth-with-output.rst b/docs/notebooks/vision-monodepth-with-output.rst index 736e0238989f8d..cf232c30fc9c73 100644 --- a/docs/notebooks/vision-monodepth-with-output.rst +++ b/docs/notebooks/vision-monodepth-with-output.rst @@ -146,11 +146,8 @@ format. 
ir_model_name_xml = "MiDaS_small.xml" ir_model_name_bin = "MiDaS_small.bin" - - if not (model_folder / ir_model_name_xml).exists(): - download_file(ir_model_url + ir_model_name_xml, filename=ir_model_name_xml, directory=model_folder) - if not (model_folder / ir_model_name_bin).exists(): - download_file(ir_model_url + ir_model_name_bin, filename=ir_model_name_bin, directory=model_folder) + download_file(ir_model_url + ir_model_name_xml, filename=ir_model_name_xml, directory=model_folder) + download_file(ir_model_url + ir_model_name_bin, filename=ir_model_name_bin, directory=model_folder) model_xml_path = model_folder / ir_model_name_xml @@ -158,13 +155,13 @@ format. .. parsed-literal:: - model/MiDaS_small.xml: 0%| | 0.00/268k [00:00 #0:0 (mpeg4 (native) -> h264 (libx264)) Stream #0:0 -> #0:1 (pcm_s16le (native) -> aac (native)) Press [q] to stop, [?] for help - [libx264 @ 0x556392e25840] -qscale is ignored, -crf is recommended. - [libx264 @ 0x556392e25840] using SAR=1/1 - [libx264 @ 0x556392e25840] using cpu capabilities: MMX2 SSE2Fast SSSE3 SSE4.2 AVX FMA3 BMI2 AVX2 AVX512 - [libx264 @ 0x556392e25840] profile High, level 3.1 - [libx264 @ 0x556392e25840] 264 - core 155 r2917 0a84d98 - H.264/MPEG-4 AVC codec - Copyleft 2003-2018 - http://www.videolan.org/x264.html - options: cabac=1 ref=3 deblock=1:0:0 analyse=0x3:0x113 me=hex subme=7 psy=1 psy_rd=1.00:0.00 mixed_ref=1 me_range=16 chroma_me=1 trellis=1 8x8dct=1 cqm=0 deadzone=21,11 fast_pskip=1 chroma_qp_offset=-2 threads=24 lookahead_threads=4 sliced_threads=0 nr=0 decimate=1 interlaced=0 bluray_compat=0 constrained_intra=0 bframes=3 b_pyramid=2 b_adapt=1 b_bias=0 direct=1 weightb=1 open_gop=0 weightp=2 keyint=250 keyint_min=25 scenecut=40 intra_refresh=0 rc_lookahead=40 rc=crf mbtree=1 crf=23.0 qcomp=0.60 qpmin=0 qpmax=69 qpstep=4 ip_ratio=1.40 aq=1:1.00 + [libx264 @ 0x55ec6513e840] -qscale is ignored, -crf is recommended. 
+ [libx264 @ 0x55ec6513e840] using SAR=1/1 + [libx264 @ 0x55ec6513e840] using cpu capabilities: MMX2 SSE2Fast SSSE3 SSE4.2 AVX FMA3 BMI2 AVX2 AVX512 + [libx264 @ 0x55ec6513e840] profile High, level 3.1 + [libx264 @ 0x55ec6513e840] 264 - core 155 r2917 0a84d98 - H.264/MPEG-4 AVC codec - Copyleft 2003-2018 - http://www.videolan.org/x264.html - options: cabac=1 ref=3 deblock=1:0:0 analyse=0x3:0x113 me=hex subme=7 psy=1 psy_rd=1.00:0.00 mixed_ref=1 me_range=16 chroma_me=1 trellis=1 8x8dct=1 cqm=0 deadzone=21,11 fast_pskip=1 chroma_qp_offset=-2 threads=24 lookahead_threads=4 sliced_threads=0 nr=0 decimate=1 interlaced=0 bluray_compat=0 constrained_intra=0 bframes=3 b_pyramid=2 b_adapt=1 b_bias=0 direct=1 weightb=1 open_gop=0 weightp=2 keyint=250 keyint_min=25 scenecut=40 intra_refresh=0 rc_lookahead=40 rc=crf mbtree=1 crf=23.0 qcomp=0.60 qpmin=0 qpmax=69 qpstep=4 ip_ratio=1.40 aq=1:1.00 Output #0, mp4, to 'results/result_voice.mp4': Metadata: encoder : Lavf58.29.100 @@ -349,27 +349,27 @@ python API and converted OpenVINO models. 
Stream #0:1: Audio: aac (LC) (mp4a / 0x6134706D), 44100 Hz, mono, fltp, 69 kb/s Metadata: encoder : Lavc58.54.100 aac - frame= 123 fps=0.0 q=-1.0 Lsize= 621kB time=00:00:05.06 bitrate=1005.8kbits/s speed=9.73x + frame= 123 fps=0.0 q=-1.0 Lsize= 621kB time=00:00:05.06 bitrate=1005.8kbits/s speed=10.6x video:573kB audio:43kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.827166% - [libx264 @ 0x556392e25840] frame I:1 Avg QP:22.24 size: 31028 - [libx264 @ 0x556392e25840] frame P:75 Avg QP:22.01 size: 6954 - [libx264 @ 0x556392e25840] frame B:47 Avg QP:25.58 size: 718 - [libx264 @ 0x556392e25840] consecutive B-frames: 38.2% 27.6% 14.6% 19.5% - [libx264 @ 0x556392e25840] mb I I16..4: 14.0% 83.9% 2.1% - [libx264 @ 0x556392e25840] mb P I16..4: 1.3% 3.3% 0.1% P16..4: 37.8% 8.2% 6.4% 0.0% 0.0% skip:43.0% - [libx264 @ 0x556392e25840] mb B I16..4: 0.2% 0.7% 0.0% B16..8: 27.9% 0.4% 0.1% direct: 0.2% skip:70.6% L0:43.9% L1:54.2% BI: 1.9% - [libx264 @ 0x556392e25840] 8x8 transform intra:73.3% inter:77.1% - [libx264 @ 0x556392e25840] coded y,uvDC,uvAC intra: 56.9% 72.4% 8.1% inter: 11.4% 13.0% 0.2% - [libx264 @ 0x556392e25840] i16 v,h,dc,p: 20% 23% 9% 48% - [libx264 @ 0x556392e25840] i8 v,h,dc,ddl,ddr,vr,hd,vl,hu: 25% 23% 36% 3% 3% 2% 2% 3% 3% - [libx264 @ 0x556392e25840] i4 v,h,dc,ddl,ddr,vr,hd,vl,hu: 39% 14% 14% 4% 6% 7% 4% 9% 3% - [libx264 @ 0x556392e25840] i8c dc,h,v,p: 42% 25% 29% 4% - [libx264 @ 0x556392e25840] Weighted P-Frames: Y:0.0% UV:0.0% - [libx264 @ 0x556392e25840] ref P L0: 74.2% 10.4% 11.1% 4.3% - [libx264 @ 0x556392e25840] ref B L0: 86.1% 11.2% 2.8% - [libx264 @ 0x556392e25840] ref B L1: 98.3% 1.7% - [libx264 @ 0x556392e25840] kb/s:953.36 - [aac @ 0x556392e27140] Qavg: 121.673 + [libx264 @ 0x55ec6513e840] frame I:1 Avg QP:22.24 size: 31028 + [libx264 @ 0x55ec6513e840] frame P:75 Avg QP:22.01 size: 6954 + [libx264 @ 0x55ec6513e840] frame B:47 Avg QP:25.58 size: 718 + [libx264 @ 0x55ec6513e840] consecutive B-frames: 38.2% 27.6% 14.6% 19.5% + 
[libx264 @ 0x55ec6513e840] mb I I16..4: 14.0% 83.9% 2.1% + [libx264 @ 0x55ec6513e840] mb P I16..4: 1.3% 3.3% 0.1% P16..4: 37.8% 8.2% 6.4% 0.0% 0.0% skip:43.0% + [libx264 @ 0x55ec6513e840] mb B I16..4: 0.2% 0.7% 0.0% B16..8: 27.9% 0.4% 0.1% direct: 0.2% skip:70.6% L0:43.9% L1:54.2% BI: 1.9% + [libx264 @ 0x55ec6513e840] 8x8 transform intra:73.3% inter:77.1% + [libx264 @ 0x55ec6513e840] coded y,uvDC,uvAC intra: 56.9% 72.4% 8.1% inter: 11.4% 13.0% 0.2% + [libx264 @ 0x55ec6513e840] i16 v,h,dc,p: 20% 23% 9% 48% + [libx264 @ 0x55ec6513e840] i8 v,h,dc,ddl,ddr,vr,hd,vl,hu: 25% 23% 36% 3% 3% 2% 2% 3% 3% + [libx264 @ 0x55ec6513e840] i4 v,h,dc,ddl,ddr,vr,hd,vl,hu: 39% 14% 14% 4% 6% 7% 4% 9% 3% + [libx264 @ 0x55ec6513e840] i8c dc,h,v,p: 42% 25% 29% 4% + [libx264 @ 0x55ec6513e840] Weighted P-Frames: Y:0.0% UV:0.0% + [libx264 @ 0x55ec6513e840] ref P L0: 74.2% 10.4% 11.1% 4.3% + [libx264 @ 0x55ec6513e840] ref B L0: 86.1% 11.2% 2.8% + [libx264 @ 0x55ec6513e840] ref B L1: 98.3% 1.7% + [libx264 @ 0x55ec6513e840] kb/s:953.36 + [aac @ 0x55ec65140140] Qavg: 121.673 diff --git a/docs/notebooks/whisper-subtitles-generation-with-output.rst b/docs/notebooks/whisper-subtitles-generation-with-output.rst index 5a3c677fc27daa..a2764b4622bf67 100644 --- a/docs/notebooks/whisper-subtitles-generation-with-output.rst +++ b/docs/notebooks/whisper-subtitles-generation-with-output.rst @@ -79,15 +79,27 @@ Install dependencies. .. 
code:: ipython3 import platform + import importlib.metadata + import importlib.util %pip install -q "nncf>=2.14.0" %pip install -q -U "openvino>=2024.5.0" "openvino-tokenizers>=2024.5.0" "openvino-genai>=2024.5.0" %pip install -q "python-ffmpeg<=1.0.16" "ffmpeg" "moviepy" "transformers>=4.45" "git+https://github.com/huggingface/optimum-intel.git" "torch>=2.1" --extra-index-url https://download.pytorch.org/whl/cpu - %pip install -q -U "yt_dlp>=2024.8.6" soundfile librosa jiwer + %pip install -q -U "yt_dlp>=2024.8.6" soundfile librosa jiwer packaging %pip install -q "gradio>=4.19" "typing_extensions>=4.9" if platform.system() == "Darwin": %pip install -q "numpy<2.0" + + + from packaging import version + + if ( + importlib.util.find_spec("tensorflow") is not None + and version.parse(importlib.metadata.version("tensorflow")) < version.parse("2.18.0") + and version.parse(importlib.metadata.version("numpy")) >= version.parse("2.0.0") + ): + %pip uninstall -q -y tensorflow .. code:: ipython3 @@ -312,7 +324,10 @@ Select the task for the model: .. code:: ipython3 - from moviepy.editor import VideoFileClip + try: + from moviepy import VideoFileClip + except ImportError: + from moviepy.editor import VideoFileClip from transformers.pipelines.audio_utils import ffmpeg_read diff --git a/docs/notebooks/yolov10-optimization-with-output.rst b/docs/notebooks/yolov10-optimization-with-output.rst index dd7106d7a6bb56..99674518b41948 100644 --- a/docs/notebooks/yolov10-optimization-with-output.rst +++ b/docs/notebooks/yolov10-optimization-with-output.rst @@ -107,17 +107,6 @@ Prerequisites %pip install -q "git+https://github.com/THU-MIG/yolov10.git" --extra-index-url https://download.pytorch.org/whl/cpu %pip install -q "torch>=2.1" "torchvision>=0.16" tqdm opencv-python "gradio>=4.19" --extra-index-url https://download.pytorch.org/whl/cpu - -.. parsed-literal:: - - WARNING: Skipping openvino as it is not installed. - WARNING: Skipping openvino-dev as it is not installed. 
- Note: you may need to restart the kernel to use updated packages. - Note: you may need to restart the kernel to use updated packages. - Note: you may need to restart the kernel to use updated packages. - Note: you may need to restart the kernel to use updated packages. - - .. code:: ipython3 from pathlib import Path diff --git a/docs/notebooks/yolov11-instance-segmentation-with-output.rst b/docs/notebooks/yolov11-instance-segmentation-with-output.rst index 6c71d614e519db..0c346bf08cf3a6 100644 --- a/docs/notebooks/yolov11-instance-segmentation-with-output.rst +++ b/docs/notebooks/yolov11-instance-segmentation-with-output.rst @@ -128,18 +128,25 @@ Import required utility functions. The lower cell will download the # Download a test sample IMAGE_PATH = Path("./data/coco_bike.jpg") - if not IMAGE_PATH.exists(): - download_file( - url="https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/coco_bike.jpg", - filename=IMAGE_PATH.name, - directory=IMAGE_PATH.parent, - ) + download_file( + url="https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/coco_bike.jpg", + filename=IMAGE_PATH.name, + directory=IMAGE_PATH.parent, + ) + + + +.. parsed-literal:: + + coco_bike.jpg: 0%| | 0.00/182k [00:00=2023.3.0" "nncf>=2.8.1" "opencv-python" "matplotlib>=3.4" "seaborn" "pandas" "scikit-learn" "torch" "torchvision" "tqdm" --extra-index-url https://download.pytorch.org/whl/cpu @@ -97,7 +98,7 @@ Prerequisites .. parsed-literal:: - /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/823/archive/.workspace/scm/ov-notebook/notebooks/yolov9-optimization/yolov9 + /opt/home/k8sworker/ci-ai/cibuilds/jobs/ov-notebook/jobs/OVNotebookOps/builds/835/archive/.workspace/scm/ov-notebook/notebooks/yolov9-optimization/yolov9 Get PyTorch model @@ -128,14 +129,14 @@ applicable for other models from YOLO V9 family. .. parsed-literal:: - model/gelan-c.pt: 0%| | 0.00/49.1M [00:00Graph Results
- + + \ No newline at end of file diff --git a/docs/sphinx_setup/_static/html/modalLLM.html b/docs/sphinx_setup/_static/html/modalLLM.html index 37b569d0bd4078..e8535c87f16090 100644 --- a/docs/sphinx_setup/_static/html/modalLLM.html +++ b/docs/sphinx_setup/_static/html/modalLLM.html @@ -87,6 +87,6 @@

Graph Results

- + + \ No newline at end of file diff --git a/docs/sphinx_setup/_static/js/custom.js b/docs/sphinx_setup/_static/js/custom.js index 241f8895ee1c61..95f9549959e102 100644 --- a/docs/sphinx_setup/_static/js/custom.js +++ b/docs/sphinx_setup/_static/js/custom.js @@ -189,7 +189,7 @@ function getCurrentVersion() { if (wordAfterDomain === 'cn') { wordAfterDomain = link[2]; } - if (["index.html", "404.html", "", "latest"].indexOf(wordAfterDomain) >= 0) { + if (["index.html", "404.html", ""].indexOf(wordAfterDomain) >= 0) { /* * If this landing page, 404 or domain.com we should get first version * */ @@ -426,7 +426,7 @@ document.addEventListener('DOMContentLoaded', function () { const searchInterfaceSa = document.querySelector("#sa-search"); const searchInterface = document.querySelector("#search"); const currentVersion = getCurrentVersion(); - + await initializeSearchInterface(searchInterfaceSa, currentVersion); await initializeSearchInterface(searchInterface); diff --git a/docs/sphinx_setup/_static/js/graphs.js b/docs/sphinx_setup/_static/js/graphs.js index 697911bad9402c..04e34d6c2fefe5 100644 --- a/docs/sphinx_setup/_static/js/graphs.js +++ b/docs/sphinx_setup/_static/js/graphs.js @@ -60,8 +60,8 @@ class Filter { // param: GraphData[], clientPlatforms[] static BySortPlatforms(graphDataArr, platformsArr) { return graphDataArr - .filter((data) => platformsArr.includes(data.Platform)) - .sort((a, b) => a.Platform.localeCompare(b.Platform)); + .filter((data) => platformsArr.includes(data.Platform)) + .sort((a, b) => a.Platform.localeCompare(b.Platform)); //sort is necessary } } @@ -145,8 +145,8 @@ class Graph { array.push([obj]) } }) - return array; + return array; } // this returns an object that is used to ender the chart @@ -283,13 +283,13 @@ $(document).ready(function () { const models = networkModels.map((networkModel) => createCheckMark(networkModel, 'networkmodel')); modal.find('.models-column').append(models); - const selectAllModelsButton = createCheckMark('', 
'networkmodel', false , false); + const selectAllModelsButton = createCheckMark('', 'networkmodel', false, false); modal.find('.models-selectall').append(selectAllModelsButton); - const selectAllPlatformsButton = createCheckMark('', 'platform', false , false); + const selectAllPlatformsButton = createCheckMark('', 'platform', false, false); modal.find('.platforms-selectall').append(selectAllPlatformsButton); - const precisions = Modal.getPrecisionsLabels(graph).map((precision) => createCheckMark(precision, 'precision', false , false)); + const precisions = Modal.getPrecisionsLabels(graph).map((precision) => createCheckMark(precision, 'precision', false, false)); modal.find('.precisions-column').append(precisions); selectAllCheckboxes(precisions); @@ -304,7 +304,7 @@ $(document).ready(function () { modal.find('#modal-display-graphs').hide(); modal.find('.ietype-column input').first().prop('checked', true); - const kpiLabels = Filter.getParameters(graph).map((parameter) => createCheckMark(parameter, 'kpi', false , true)); + const kpiLabels = Filter.getParameters(graph).map((parameter) => createCheckMark(parameter, 'kpi', false, true)); modal.find('.kpi-column').append(kpiLabels); $('body').prepend(modal); @@ -511,6 +511,7 @@ $(document).ready(function () { listContainer.style.margin = 0; listContainer.style.padding = 0; listContainer.style.paddingLeft = '0px'; + listContainer.style.float = "right"; legendContainer.appendChild(listContainer); } @@ -521,57 +522,55 @@ $(document).ready(function () { const htmlLegendPlugin = { id: 'htmlLegend', afterUpdate(chart, args, options) { - + charts = [...new Set([...charts, ...[chart]])]; const ul = getOrCreateLegendList(chart, chart.options.plugins.htmlLegend.containerID); - // Remove old legend items while (ul.firstChild) { ul.firstChild.remove(); } - const items = chart.legend.legendItems; + const items = chart.options.plugins.legend.labels.generateLabels(chart); items.forEach(item => { const li = 
document.createElement('li'); li.style.alignItems = 'center'; li.style.display = 'block'; li.style.flexDirection = 'column'; - li.style.marginLeft = '4px'; - + li.style.marginLeft = '6px'; + li.style.cursor = "pointer"; + li.style.fontSize = '0.6rem'; + li.style.textDecoration = item.hidden ? 'line-through' : ''; li.onclick = () => { - chart.toggleDataVisibility(item.index); - chart.update(); + charts.forEach((chartItem) => { + chartItem.setDatasetVisibility(item.datasetIndex, !chartItem.isDatasetVisible(item.datasetIndex)); + chartItem.update(); + }) }; - - // Color box + const boxSpan = document.createElement('span'); boxSpan.style.background = item.fillStyle; boxSpan.style.borderColor = item.strokeStyle; - boxSpan.style.borderWidth = item.lineWidth + 'px'; boxSpan.style.display = 'inline-block'; boxSpan.style.height = '10px'; boxSpan.style.marginRight = '4px'; boxSpan.style.width = '30px'; - // Text - const textContainer = document.createElement('p'); - textContainer.style.color = '#666'; - textContainer.style.margin = 0; - textContainer.style.padding = 0; - textContainer.style.fontSize = '0.6rem'; - textContainer.style.marginLeft = '3px'; - textContainer.style.textDecoration = item.hidden ? 'line-through' : ''; + const textSpan = document.createElement('span'); + textSpan.style.bottom = '1px' + textSpan.style.position = 'relative' + textSpan.style.fontSize = '0.6rem'; + textSpan.style.textDecoration = item.hidden ? 
'line-through' : ''; const text = document.createTextNode(item.text); - textContainer.appendChild(text); + textSpan.appendChild(text); li.appendChild(boxSpan); - li.appendChild(textContainer); + li.appendChild(textSpan); ul.appendChild(li); }); } }; - function getChartOptionsByEngines(containerId, allowedAxisIDs) { + function getChartOptionsByEngines(allowedAxisIDs) { const axisConfigs = { x: { title: { display: true, text: 'Request Rate' } @@ -602,11 +601,11 @@ $(document).ready(function () { }, {}), plugins: { legend: { display: false }, - htmlLegend: { containerID: containerId } + htmlLegend: { containerID: 'modal-footer' } } }; } - function getChartOptions(title, containerId) { + function getChartOptions(title) { return { responsive: true, indexAxis: 'y', @@ -633,7 +632,7 @@ $(document).ready(function () { display: false }, htmlLegend: { - containerID: containerId, + containerID: 'modal-footer', } } } @@ -838,7 +837,7 @@ $(document).ready(function () { new Chart(context, { type: 'bar', data: getChartData(labels, datasets), - options: getChartOptions(chartTitle, containerId), + options: getChartOptions(chartTitle), plugins: [htmlLegendPlugin] }); }); @@ -858,9 +857,9 @@ $(document).ready(function () { }) } } - + var charts = []; function processMetricByEngines(labels, datasets, container, widthClass, id) { - var heightRatio = (80 + (labels.length * 55)); + var heightRatio = (30 + (labels.length * 55)); var chart = $('
'); const containerId = `legend-container-${id}`; const legend = $(`
`); @@ -894,8 +893,7 @@ $(document).ready(function () { backgroundColor: precision.color, yAxisID: precision.label === "Throughput" ? 'y' : 'y1', fill: false - } - ) + }) }) }) @@ -914,9 +912,10 @@ $(document).ready(function () { labels: labels, datasets: graphDatas }, - options: getChartOptionsByEngines(containerId, allowedAxisIDs), + options: getChartOptionsByEngines(allowedAxisIDs), plugins: [htmlLegendPlugin] }); + }); } diff --git a/docs/sphinx_setup/_templates/layout.html b/docs/sphinx_setup/_templates/layout.html index 7f873c679c2e83..0d2331b2c83fe3 100644 --- a/docs/sphinx_setup/_templates/layout.html +++ b/docs/sphinx_setup/_templates/layout.html @@ -11,7 +11,7 @@ - + diff --git a/docs/sphinx_setup/api/nodejs_api/addon.rst b/docs/sphinx_setup/api/nodejs_api/addon.rst index f6ee4ab7b15836..7c42824bcd88a3 100644 --- a/docs/sphinx_setup/api/nodejs_api/addon.rst +++ b/docs/sphinx_setup/api/nodejs_api/addon.rst @@ -49,6 +49,7 @@ The **openvino-node** package exports ``addon`` which contains the following pro resizeAlgorithm: typeof resizeAlgorithm; PrePostProcessor: PrePostProcessorConstructor; }; + saveModelSync(model: Model, path: string, compressToFp16?: boolean): void; element: typeof element; } @@ -142,3 +143,39 @@ Properties - **Defined in:** `addon.ts:674 `__ + +.. rubric:: saveModelSync + +* + + .. code-block:: ts + + saveModelSync(model: Model, path: string, compressToFp16?: boolean): void; + + + This method saves a model to IR (xml and bin files), applying all + necessary transformations that are usually added during model conversion. + Particularly, weights are compressed to FP16 by default, and debug information + in model nodes is cleaned up. + + * **Parameters:** + + - model: :doc:`Model ` + + A model which will be converted to IR and saved. + + - path: string + + A path for saving the model. + + - ``Optional`` + + - compressToFp16: boolean + + Compression of weights to FP16 floating point precision. The default value is `true` . 
+ + * **Returns:** void + + * **Defined in:** + `addon.ts:692 `__ + diff --git a/docs/sphinx_setup/assets/versions_raw.js b/docs/sphinx_setup/assets/versions_raw.js index 8045057450bf5f..8b9457135836d5 100644 --- a/docs/sphinx_setup/assets/versions_raw.js +++ b/docs/sphinx_setup/assets/versions_raw.js @@ -1 +1 @@ -var data='[{"version": "2024"}, {"version": "2023.3"}, {"version": "2022.3"}, {"version": "nightly"}, {"version": "archives"}]'; \ No newline at end of file +var data='[{"version": "2024"}, {"version": "2023.3"}, {"version": "nightly"}, {"version": "archives"}]'; diff --git a/docs/sphinx_setup/index.rst b/docs/sphinx_setup/index.rst index 2e6f960468015f..ad98be58cde1cd 100644 --- a/docs/sphinx_setup/index.rst +++ b/docs/sphinx_setup/index.rst @@ -11,8 +11,8 @@ generative AI, video, audio, and language with models from popular frameworks li TensorFlow, ONNX, and more. Convert and optimize models, and deploy across a mix of Intel® hardware and environments, on-premises and on-device, in the browser or in the cloud. -Check out the `OpenVINO Cheat Sheet. `__ - +| Check out the `OpenVINO Cheat Sheet [PDF] `__ +| Check out the `GenAI Quick-start Guide [PDF] `__ .. 
container:: diff --git a/samples/python/benchmark/bert_benchmark/bert_benchmark.py b/samples/python/benchmark/bert_benchmark/bert_benchmark.py index f81eac29387182..11055d7a70b163 100755 --- a/samples/python/benchmark/bert_benchmark/bert_benchmark.py +++ b/samples/python/benchmark/bert_benchmark/bert_benchmark.py @@ -9,8 +9,8 @@ import tempfile from time import perf_counter -import datasets import openvino as ov +import datasets from openvino.runtime import get_version from transformers import AutoTokenizer from transformers.onnx import export diff --git a/src/bindings/js/node/src/tensor.cpp b/src/bindings/js/node/src/tensor.cpp index 78785a10b2275b..14a68286e5372c 100644 --- a/src/bindings/js/node/src/tensor.cpp +++ b/src/bindings/js/node/src/tensor.cpp @@ -66,6 +66,12 @@ Napi::Object TensorWrap::wrap(Napi::Env env, ov::Tensor tensor) { } Napi::Value TensorWrap::get_data(const Napi::CallbackInfo& info) { + Napi::Env env = info.Env(); + if (info.Length() > 0) { + reportError(env, "getData() does not accept any arguments."); + return env.Undefined(); + } + auto type = _tensor.get_element_type(); switch (type) { diff --git a/src/bindings/js/node/tests/unit/tensor.test.js b/src/bindings/js/node/tests/unit/tensor.test.js index 656bdf9b122e58..2cd5115b5af2a8 100644 --- a/src/bindings/js/node/tests/unit/tensor.test.js +++ b/src/bindings/js/node/tests/unit/tensor.test.js @@ -121,6 +121,12 @@ describe('ov.Tensor tests', () => { assert.deepStrictEqual(tensor.getData(), data); }); + it('getData should throw an error if arguments are provided', () => { + const tensor = new ov.Tensor(ov.element.f32, shape, data); + assert.throws(() => tensor.getData(1), { + message: 'getData() does not accept any arguments.', + }); + }); it('test tensor.data setter - different element type throws', () => { const float64Data = Float64Array.from([1, 2, 3]); const tensor = new ov.Tensor(ov.element.f32, [1, 3]); diff --git a/src/bindings/python/constraints.txt b/src/bindings/python/constraints.txt 
index a0fbf982105ad6..cc1d4514b7bbfe 100644 --- a/src/bindings/python/constraints.txt +++ b/src/bindings/python/constraints.txt @@ -10,10 +10,11 @@ pytest-timeout==2.3.1 # Python bindings build<1.3 pygments>=2.8.1 -setuptools>=65.6.1,<75.3.0 +setuptools>=70.1,<75.6 sympy>=1.10 wheel>=0.38.1 patchelf<=0.17.2.1 +packaging>=22.0 # Frontends h5py>=3.1.0,<3.13.0 diff --git a/src/bindings/python/setup.cfg b/src/bindings/python/setup.cfg index 7fc407ad56e7a1..89fabcb659c108 100644 --- a/src/bindings/python/setup.cfg +++ b/src/bindings/python/setup.cfg @@ -66,7 +66,7 @@ show_source = True docstring-convention = google enable-extensions = G per-file-ignores = - src/openvino/runtime/*/ops.py: VNE001,VNE003 + src/openvino/*/ops.py: VNE001,VNE003 src/openvino/preprocess/torchvision/*: N801, VNE001 *__init__.py: F401 diff --git a/src/bindings/python/src/openvino/__init__.py b/src/bindings/python/src/openvino/__init__.py index 57f03f00c2eebf..e4d1a247520332 100644 --- a/src/bindings/python/src/openvino/__init__.py +++ b/src/bindings/python/src/openvino/__init__.py @@ -56,6 +56,24 @@ from openvino._pyopenvino import RemoteTensor from openvino._pyopenvino import Op +# Import opsets +from openvino import opset1 +from openvino import opset2 +from openvino import opset3 +from openvino import opset4 +from openvino import opset5 +from openvino import opset6 +from openvino import opset7 +from openvino import opset8 +from openvino import opset9 +from openvino import opset10 +from openvino import opset11 +from openvino import opset12 +from openvino import opset13 +from openvino import opset14 +from openvino import opset15 +from openvino import opset16 + # libva related: from openvino._pyopenvino import VAContext from openvino._pyopenvino import VASurfaceTensor diff --git a/src/bindings/python/src/openvino/frontend/pytorch/gptq.py b/src/bindings/python/src/openvino/frontend/pytorch/gptq.py index a1c6aecc45d421..60a48c275d6681 100644 --- 
a/src/bindings/python/src/openvino/frontend/pytorch/gptq.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/gptq.py @@ -177,15 +177,3 @@ def unpatch_model(model): log.warning("Exception raised during GPTQ model unpatching. " "Depending on the exact issue it may lead to broken " "original model.\n%s", error) - - -def detect_gptq_model_raw(model): - return (model and getattr(model, 'config', None) and - getattr(model.config, 'quantization_config', None) and - model.config.quantization_config.quant_method == 'gptq') - - -def detect_gptq_model(model): - return (detect_gptq_model_raw(model) or - getattr(model, 'model', None) and - detect_gptq_model_raw(model.model)) diff --git a/src/bindings/python/src/openvino/frontend/pytorch/quantized.py b/src/bindings/python/src/openvino/frontend/pytorch/quantized.py new file mode 100644 index 00000000000000..310e95cb9985d7 --- /dev/null +++ b/src/bindings/python/src/openvino/frontend/pytorch/quantized.py @@ -0,0 +1,73 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from typing import Optional +import torch +from openvino.frontend.pytorch import ModuleExtension, gptq +from openvino.frontend.pytorch.patch_model import patch_model, unpatch_model + + +def detect_quantized_model(model: torch.nn.Module) -> Optional[str]: + """Detects the quantization method used in a given PyTorch model. + + Args: + model (torch.nn.Module): The PyTorch model to check for quantization. + + Returns: + str: The quantization method if available, otherwise None. + """ + if (model and getattr(model, "config", None) + and getattr(model.config, "quantization_config", None)): + return model.config.quantization_config.quant_method + if getattr(model, "model", None): + return detect_quantized_model(model.model) + return None + + +def patch_quantized(model: torch.nn.Module) -> None: + """Patches a model based on its quantization type ("awq" or "gptq"). 
+ + Args: + model (torch.nn.Module): The model to patch. + + Raises: + RuntimeError: If the quantization type is unknown. + """ + quant_type = detect_quantized_model(model) + if quant_type == "awq": + extensions = {} + try: + from awq.modules.linear import WQLinear_GEMM + extensions[WQLinear_GEMM] = ModuleExtension( + WQLinear_GEMM, "ov_ext::awq_gemm", + convert=lambda module, target_op, *args, **kwargs: target_op( + args[0], module.qweight, module.qzeros, module.scales, + torch.tensor(module.group_size), + torch.tensor(module.w_bit), module.bias), + evaluate=lambda module, *args, **kwargs: torch.full( + list(args[0].shape[:-1]) + [module.out_features], 0.5, + dtype=torch.float32)) # type: ignore + except ImportError: + pass + patch_model(model, extensions, + "_openvino_quantized_patch_orig_forward") # type: ignore + elif quant_type == "gptq": + model._openvino_gptq_patched = True + gptq.patch_model(model) # type: ignore + else: + raise RuntimeError(f"Unknown quantization type: {quant_type}.") + + +def unpatch_quantized(model: torch.nn.Module) -> None: + """Reverts the patching applied to a quantized PyTorch model. + + Args: + model (torch.nn.Module): The model to unpatch. 
+ """ + if getattr(model, "_openvino_gptq_patched", False): + gptq.unpatch_model(model) # type: ignore + del model._openvino_gptq_patched + else: + unpatch_model(model, + "_openvino_quantized_patch_orig_forward") # type: ignore diff --git a/src/bindings/python/src/openvino/frontend/pytorch/ts_decoder.py b/src/bindings/python/src/openvino/frontend/pytorch/ts_decoder.py index eb32a0a93c669b..6d8fdb1658793e 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/ts_decoder.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/ts_decoder.py @@ -16,7 +16,7 @@ graph_has_ops, ) from openvino.runtime import opset11 as ops -from openvino.frontend.pytorch import gptq, patch_model +from openvino.frontend.pytorch import quantized, patch_model from openvino.frontend.pytorch.module_extension import ModuleExtension import inspect @@ -141,27 +141,25 @@ def _get_scripted_model(self, pt_module, example_inputs=None, skip_freeze=False) patch_model.patch_model( pt_module, self.module_extensions, orig_forward_name) - gptq_patched = False - if gptq.detect_gptq_model(pt_module): + patched = False + if quantized.detect_quantized_model(pt_module) is not None: try: - gptq.patch_model(pt_module) - gptq_patched = True + quantized.patch_quantized(pt_module) + patched = True except Exception as error: log.warning( - "Failed patching of AutoGPTQ model. Error message:\n%s" - "\nTracing of the model will likely be unsuccessful or incorrect", - error) - gptq.unpatch_model(pt_module) - gptq_patched = False + "Failed patching of AutoGPTQ model. 
Error message:\n" + "Tracing of the model will likely be unsuccessful or incorrect", + exc_info=error) + quantized.unpatch_quantized(pt_module) + patched = False try: scripted = torch.jit.trace( pt_module, **input_parameters, strict=False) finally: - if gptq_patched: - gptq.unpatch_model(pt_module) - if self.module_extensions: - patch_model.unpatch_model(pt_module, orig_forward_name) + if patched: + quantized.unpatch_quantized(pt_module) have_to_freeze_ops = ["prim::Uninitialized", "prim::unchecked_cast", "aten::append"] diff --git a/src/bindings/python/src/openvino/op/__init__.py b/src/bindings/python/src/openvino/op/__init__.py new file mode 100644 index 00000000000000..ee324dd76380f7 --- /dev/null +++ b/src/bindings/python/src/openvino/op/__init__.py @@ -0,0 +1,19 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +""" +Package: openvino.op +Low level wrappers for the c++ api in ov::op. +""" + +# flake8: noqa + +from openvino._pyopenvino.op import Constant +from openvino._pyopenvino.op import assign +from openvino._pyopenvino.op import _PagedAttentionExtension +from openvino._pyopenvino.op import Parameter +from openvino._pyopenvino.op import if_op +from openvino._pyopenvino.op import loop +from openvino._pyopenvino.op import tensor_iterator +from openvino._pyopenvino.op import read_value +from openvino._pyopenvino.op import Result diff --git a/src/bindings/python/src/openvino/op/util/__init__.py b/src/bindings/python/src/openvino/op/util/__init__.py new file mode 100644 index 00000000000000..0c946b115f451e --- /dev/null +++ b/src/bindings/python/src/openvino/op/util/__init__.py @@ -0,0 +1,22 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +""" +Package: openvino.op.util +Low level wrappers for the c++ api in ov::op::util. 
+""" +# flake8: noqa + +from openvino._pyopenvino.op.util import UnaryElementwiseArithmetic +from openvino._pyopenvino.op.util import BinaryElementwiseComparison +from openvino._pyopenvino.op.util import BinaryElementwiseArithmetic +from openvino._pyopenvino.op.util import BinaryElementwiseLogical +from openvino._pyopenvino.op.util import ArithmeticReduction +from openvino._pyopenvino.op.util import IndexReduction +from openvino._pyopenvino.op.util import VariableInfo +from openvino._pyopenvino.op.util import Variable +from openvino._pyopenvino.op.util import MergedInputDescription +from openvino._pyopenvino.op.util import InvariantInputDescription +from openvino._pyopenvino.op.util import SliceInputDescription +from openvino._pyopenvino.op.util import ConcatOutputDescription +from openvino._pyopenvino.op.util import BodyOutputDescription diff --git a/src/bindings/python/src/openvino/opset1/__init__.py b/src/bindings/python/src/openvino/opset1/__init__.py new file mode 100644 index 00000000000000..35ceec6b0d9892 --- /dev/null +++ b/src/bindings/python/src/openvino/opset1/__init__.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.opset1.ops import absolute +from openvino.opset1.ops import absolute as abs +from openvino.opset1.ops import acos +from openvino.opset1.ops import add +from openvino.opset1.ops import asin +from openvino.opset1.ops import atan +from openvino.opset1.ops import avg_pool +from openvino.opset1.ops import batch_norm_inference +from openvino.opset1.ops import binary_convolution +from openvino.opset1.ops import broadcast +from openvino.opset1.ops import ceiling +from openvino.opset1.ops import ceiling as ceil +from openvino.opset1.ops import clamp +from openvino.opset1.ops import concat +from openvino.opset1.ops import constant +from openvino.opset1.ops import convert +from openvino.opset1.ops import convert_like +from openvino.opset1.ops import 
convolution +from openvino.opset1.ops import convolution_backprop_data +from openvino.opset1.ops import cos +from openvino.opset1.ops import cosh +from openvino.opset1.ops import ctc_greedy_decoder +from openvino.opset1.ops import deformable_convolution +from openvino.opset1.ops import deformable_psroi_pooling +from openvino.opset1.ops import depth_to_space +from openvino.opset1.ops import detection_output +from openvino.opset1.ops import divide +from openvino.opset1.ops import elu +from openvino.opset1.ops import equal +from openvino.opset1.ops import erf +from openvino.opset1.ops import exp +from openvino.opset1.ops import fake_quantize +from openvino.opset1.ops import floor +from openvino.opset1.ops import floor_mod +from openvino.opset1.ops import gather +from openvino.opset1.ops import gather_tree +from openvino.opset1.ops import greater +from openvino.opset1.ops import greater_equal +from openvino.opset1.ops import grn +from openvino.opset1.ops import group_convolution +from openvino.opset1.ops import group_convolution_backprop_data +from openvino.opset1.ops import hard_sigmoid +from openvino.opset1.ops import interpolate +from openvino.opset1.ops import less +from openvino.opset1.ops import less_equal +from openvino.opset1.ops import log +from openvino.opset1.ops import logical_and +from openvino.opset1.ops import logical_not +from openvino.opset1.ops import logical_or +from openvino.opset1.ops import logical_xor +from openvino.opset1.ops import lrn +from openvino.opset1.ops import lstm_cell +from openvino.opset1.ops import matmul +from openvino.opset1.ops import max_pool +from openvino.opset1.ops import maximum +from openvino.opset1.ops import minimum +from openvino.opset1.ops import mod +from openvino.opset1.ops import multiply +from openvino.opset1.ops import negative +from openvino.opset1.ops import non_max_suppression +from openvino.opset1.ops import normalize_l2 +from openvino.opset1.ops import not_equal +from openvino.opset1.ops import one_hot +from 
openvino.opset1.ops import pad +from openvino.opset1.ops import parameter +from openvino.opset1.ops import power +from openvino.opset1.ops import prelu +from openvino.opset1.ops import prior_box +from openvino.opset1.ops import prior_box_clustered +from openvino.opset1.ops import psroi_pooling +from openvino.opset1.ops import proposal +from openvino.opset1.ops import range +from openvino.opset1.ops import reduce_logical_and +from openvino.opset1.ops import reduce_logical_or +from openvino.opset1.ops import reduce_max +from openvino.opset1.ops import reduce_mean +from openvino.opset1.ops import reduce_min +from openvino.opset1.ops import reduce_prod +from openvino.opset1.ops import reduce_sum +from openvino.opset1.ops import region_yolo +from openvino.opset1.ops import relu +from openvino.opset1.ops import reshape +from openvino.opset1.ops import result +from openvino.opset1.ops import reverse_sequence +from openvino.opset1.ops import select +from openvino.opset1.ops import selu +from openvino.opset1.ops import shape_of +from openvino.opset1.ops import sigmoid +from openvino.opset1.ops import sign +from openvino.opset1.ops import sin +from openvino.opset1.ops import sinh +from openvino.opset1.ops import softmax +from openvino.opset1.ops import space_to_depth +from openvino.opset1.ops import split +from openvino.opset1.ops import sqrt +from openvino.opset1.ops import squared_difference +from openvino.opset1.ops import squeeze +from openvino.opset1.ops import strided_slice +from openvino.opset1.ops import subtract +from openvino.opset1.ops import tan +from openvino.opset1.ops import tanh +from openvino.opset1.ops import tensor_iterator +from openvino.opset1.ops import tile +from openvino.opset1.ops import topk +from openvino.opset1.ops import transpose +from openvino.opset1.ops import unsqueeze +from openvino.opset1.ops import variadic_split diff --git a/src/bindings/python/src/openvino/runtime/opset1/ops.py b/src/bindings/python/src/openvino/opset1/ops.py similarity 
index 99% rename from src/bindings/python/src/openvino/runtime/opset1/ops.py rename to src/bindings/python/src/openvino/opset1/ops.py index 54f32d404336d4..edca6c62a0b246 100644 --- a/src/bindings/python/src/openvino/runtime/opset1/ops.py +++ b/src/bindings/python/src/openvino/opset1/ops.py @@ -9,7 +9,7 @@ from functools import partial from openvino.runtime import Node, PartialShape, Type -from openvino.runtime.op import Constant, Parameter, tensor_iterator +from openvino.op import Constant, Parameter, tensor_iterator from openvino.runtime.opset_utils import _get_node_factory from openvino.runtime.utils.decorators import binary_op, nameable_op, unary_op from openvino.runtime.utils.input_validation import ( diff --git a/src/bindings/python/src/openvino/opset10/__init__.py b/src/bindings/python/src/openvino/opset10/__init__.py new file mode 100644 index 00000000000000..659cb99bb8644d --- /dev/null +++ b/src/bindings/python/src/openvino/opset10/__init__.py @@ -0,0 +1,179 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.opset1.ops import absolute +from openvino.opset1.ops import absolute as abs +from openvino.opset1.ops import acos +from openvino.opset4.ops import acosh +from openvino.opset8.ops import adaptive_avg_pool +from openvino.opset8.ops import adaptive_max_pool +from openvino.opset1.ops import add +from openvino.opset1.ops import asin +from openvino.opset4.ops import asinh +from openvino.opset6.ops import assign +from openvino.opset1.ops import atan +from openvino.opset4.ops import atanh +from openvino.opset1.ops import avg_pool +from openvino.opset5.ops import batch_norm_inference +from openvino.opset2.ops import batch_to_space +from openvino.opset1.ops import binary_convolution +from openvino.opset3.ops import broadcast +from openvino.opset3.ops import bucketize +from openvino.opset1.ops import ceiling +from openvino.opset1.ops import ceiling as ceil +from openvino.opset1.ops 
import clamp +from openvino.opset1.ops import concat +from openvino.opset1.ops import constant +from openvino.opset1.ops import convert +from openvino.opset1.ops import convert_like +from openvino.opset1.ops import convolution +from openvino.opset1.ops import convolution_backprop_data +from openvino.opset1.ops import cos +from openvino.opset1.ops import cosh +from openvino.opset1.ops import ctc_greedy_decoder +from openvino.opset6.ops import ctc_greedy_decoder_seq_len +from openvino.opset4.ops import ctc_loss +from openvino.opset3.ops import cum_sum +from openvino.opset3.ops import cum_sum as cumsum +from openvino.opset8.ops import deformable_convolution +from openvino.opset1.ops import deformable_psroi_pooling +from openvino.opset1.ops import depth_to_space +from openvino.opset8.ops import detection_output +from openvino.opset7.ops import dft +from openvino.opset1.ops import divide +from openvino.opset7.ops import einsum +from openvino.opset1.ops import elu +from openvino.opset3.ops import embedding_bag_offsets_sum +from openvino.opset3.ops import embedding_bag_packed_sum +from openvino.opset3.ops import embedding_segments_sum +from openvino.opset3.ops import extract_image_patches +from openvino.opset1.ops import equal +from openvino.opset1.ops import erf +from openvino.opset1.ops import exp +from openvino.opset9.ops import eye +from openvino.opset1.ops import fake_quantize +from openvino.opset1.ops import floor +from openvino.opset1.ops import floor_mod +from openvino.opset8.ops import gather +from openvino.opset6.ops import gather_elements +from openvino.opset8.ops import gather_nd +from openvino.opset1.ops import gather_tree +from openvino.opset7.ops import gelu +from openvino.opset9.ops import generate_proposals +from openvino.opset1.ops import greater +from openvino.opset1.ops import greater_equal +from openvino.opset9.ops import grid_sample +from openvino.opset1.ops import grn +from openvino.opset1.ops import group_convolution +from openvino.opset1.ops 
import group_convolution_backprop_data +from openvino.opset3.ops import gru_cell +from openvino.opset5.ops import gru_sequence +from openvino.opset1.ops import hard_sigmoid +from openvino.opset5.ops import hsigmoid +from openvino.opset4.ops import hswish +from openvino.opset7.ops import idft +from openvino.opset8.ops import if_op +from openvino.opset10.ops import interpolate +from openvino.opset9.ops import irdft +from openvino.opset10.ops import is_finite +from openvino.opset10.ops import is_inf +from openvino.opset10.ops import is_nan +from openvino.opset8.ops import i420_to_bgr +from openvino.opset8.ops import i420_to_rgb +from openvino.opset1.ops import less +from openvino.opset1.ops import less_equal +from openvino.opset1.ops import log +from openvino.opset1.ops import logical_and +from openvino.opset1.ops import logical_not +from openvino.opset1.ops import logical_or +from openvino.opset1.ops import logical_xor +from openvino.opset5.ops import log_softmax +from openvino.opset5.ops import loop +from openvino.opset1.ops import lrn +from openvino.opset4.ops import lstm_cell +from openvino.opset5.ops import lstm_sequence +from openvino.opset1.ops import matmul +from openvino.opset8.ops import matrix_nms +from openvino.opset8.ops import max_pool +from openvino.opset1.ops import maximum +from openvino.opset1.ops import minimum +from openvino.opset4.ops import mish +from openvino.opset1.ops import mod +from openvino.opset9.ops import multiclass_nms +from openvino.opset1.ops import multiply +from openvino.opset6.ops import mvn +from openvino.opset1.ops import negative +from openvino.opset9.ops import non_max_suppression +from openvino.opset3.ops import non_zero +from openvino.opset1.ops import normalize_l2 +from openvino.opset1.ops import not_equal +from openvino.opset8.ops import nv12_to_bgr +from openvino.opset8.ops import nv12_to_rgb +from openvino.opset1.ops import one_hot +from openvino.opset1.ops import pad +from openvino.opset1.ops import parameter +from 
openvino.opset1.ops import power +from openvino.opset1.ops import prelu +from openvino.opset8.ops import prior_box +from openvino.opset1.ops import prior_box_clustered +from openvino.opset1.ops import psroi_pooling +from openvino.opset4.ops import proposal +from openvino.opset1.ops import range +from openvino.opset8.ops import random_uniform +from openvino.opset9.ops import rdft +from openvino.opset6.ops import read_value +from openvino.opset4.ops import reduce_l1 +from openvino.opset4.ops import reduce_l2 +from openvino.opset1.ops import reduce_logical_and +from openvino.opset1.ops import reduce_logical_or +from openvino.opset1.ops import reduce_max +from openvino.opset1.ops import reduce_mean +from openvino.opset1.ops import reduce_min +from openvino.opset1.ops import reduce_prod +from openvino.opset1.ops import reduce_sum +from openvino.opset1.ops import region_yolo +from openvino.opset2.ops import reorg_yolo +from openvino.opset1.ops import relu +from openvino.opset1.ops import reshape +from openvino.opset1.ops import result +from openvino.opset1.ops import reverse_sequence +from openvino.opset3.ops import rnn_cell +from openvino.opset5.ops import rnn_sequence +from openvino.opset9.ops import roi_align +from openvino.opset2.ops import roi_pooling +from openvino.opset7.ops import roll +from openvino.opset5.ops import round +from openvino.opset3.ops import scatter_elements_update +from openvino.opset4.ops import scatter_nd_update +from openvino.opset3.ops import scatter_update +from openvino.opset1.ops import select +from openvino.opset1.ops import selu +from openvino.opset3.ops import shape_of +from openvino.opset3.ops import shuffle_channels +from openvino.opset1.ops import sigmoid +from openvino.opset1.ops import sign +from openvino.opset1.ops import sin +from openvino.opset1.ops import sinh +from openvino.opset8.ops import slice +from openvino.opset8.ops import softmax +from openvino.opset4.ops import softplus +from openvino.opset9.ops import softsign +from 
openvino.opset2.ops import space_to_batch +from openvino.opset1.ops import space_to_depth +from openvino.opset1.ops import split +from openvino.opset1.ops import sqrt +from openvino.opset1.ops import squared_difference +from openvino.opset1.ops import squeeze +from openvino.opset1.ops import strided_slice +from openvino.opset1.ops import subtract +from openvino.opset4.ops import swish +from openvino.opset1.ops import tan +from openvino.opset1.ops import tanh +from openvino.opset1.ops import tensor_iterator +from openvino.opset1.ops import tile +from openvino.opset3.ops import topk +from openvino.opset1.ops import transpose +from openvino.opset10.ops import unique +from openvino.opset1.ops import unsqueeze +from openvino.opset1.ops import variadic_split diff --git a/src/bindings/python/src/openvino/runtime/opset10/ops.py b/src/bindings/python/src/openvino/opset10/ops.py similarity index 100% rename from src/bindings/python/src/openvino/runtime/opset10/ops.py rename to src/bindings/python/src/openvino/opset10/ops.py diff --git a/src/bindings/python/src/openvino/opset11/__init__.py b/src/bindings/python/src/openvino/opset11/__init__.py new file mode 100644 index 00000000000000..2a07b9fab9f2c4 --- /dev/null +++ b/src/bindings/python/src/openvino/opset11/__init__.py @@ -0,0 +1,179 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.opset1.ops import absolute +from openvino.opset1.ops import absolute as abs +from openvino.opset1.ops import acos +from openvino.opset4.ops import acosh +from openvino.opset8.ops import adaptive_avg_pool +from openvino.opset8.ops import adaptive_max_pool +from openvino.opset1.ops import add +from openvino.opset1.ops import asin +from openvino.opset4.ops import asinh +from openvino.opset6.ops import assign +from openvino.opset1.ops import atan +from openvino.opset4.ops import atanh +from openvino.opset1.ops import avg_pool +from openvino.opset5.ops import 
batch_norm_inference +from openvino.opset2.ops import batch_to_space +from openvino.opset1.ops import binary_convolution +from openvino.opset3.ops import broadcast +from openvino.opset3.ops import bucketize +from openvino.opset1.ops import ceiling +from openvino.opset1.ops import ceiling as ceil +from openvino.opset1.ops import clamp +from openvino.opset1.ops import concat +from openvino.opset1.ops import constant +from openvino.opset1.ops import convert +from openvino.opset1.ops import convert_like +from openvino.opset1.ops import convolution +from openvino.opset1.ops import convolution_backprop_data +from openvino.opset1.ops import cos +from openvino.opset1.ops import cosh +from openvino.opset1.ops import ctc_greedy_decoder +from openvino.opset6.ops import ctc_greedy_decoder_seq_len +from openvino.opset4.ops import ctc_loss +from openvino.opset3.ops import cum_sum +from openvino.opset3.ops import cum_sum as cumsum +from openvino.opset8.ops import deformable_convolution +from openvino.opset1.ops import deformable_psroi_pooling +from openvino.opset1.ops import depth_to_space +from openvino.opset8.ops import detection_output +from openvino.opset7.ops import dft +from openvino.opset1.ops import divide +from openvino.opset7.ops import einsum +from openvino.opset1.ops import elu +from openvino.opset3.ops import embedding_bag_offsets_sum +from openvino.opset3.ops import embedding_bag_packed_sum +from openvino.opset3.ops import embedding_segments_sum +from openvino.opset3.ops import extract_image_patches +from openvino.opset1.ops import equal +from openvino.opset1.ops import erf +from openvino.opset1.ops import exp +from openvino.opset9.ops import eye +from openvino.opset1.ops import fake_quantize +from openvino.opset1.ops import floor +from openvino.opset1.ops import floor_mod +from openvino.opset8.ops import gather +from openvino.opset6.ops import gather_elements +from openvino.opset8.ops import gather_nd +from openvino.opset1.ops import gather_tree +from 
openvino.opset7.ops import gelu +from openvino.opset9.ops import generate_proposals +from openvino.opset1.ops import greater +from openvino.opset1.ops import greater_equal +from openvino.opset9.ops import grid_sample +from openvino.opset1.ops import grn +from openvino.opset1.ops import group_convolution +from openvino.opset1.ops import group_convolution_backprop_data +from openvino.opset3.ops import gru_cell +from openvino.opset5.ops import gru_sequence +from openvino.opset1.ops import hard_sigmoid +from openvino.opset5.ops import hsigmoid +from openvino.opset4.ops import hswish +from openvino.opset7.ops import idft +from openvino.opset8.ops import if_op +from openvino.opset11.ops import interpolate +from openvino.opset9.ops import irdft +from openvino.opset10.ops import is_finite +from openvino.opset10.ops import is_inf +from openvino.opset10.ops import is_nan +from openvino.opset8.ops import i420_to_bgr +from openvino.opset8.ops import i420_to_rgb +from openvino.opset1.ops import less +from openvino.opset1.ops import less_equal +from openvino.opset1.ops import log +from openvino.opset1.ops import logical_and +from openvino.opset1.ops import logical_not +from openvino.opset1.ops import logical_or +from openvino.opset1.ops import logical_xor +from openvino.opset5.ops import log_softmax +from openvino.opset5.ops import loop +from openvino.opset1.ops import lrn +from openvino.opset4.ops import lstm_cell +from openvino.opset5.ops import lstm_sequence +from openvino.opset1.ops import matmul +from openvino.opset8.ops import matrix_nms +from openvino.opset8.ops import max_pool +from openvino.opset1.ops import maximum +from openvino.opset1.ops import minimum +from openvino.opset4.ops import mish +from openvino.opset1.ops import mod +from openvino.opset9.ops import multiclass_nms +from openvino.opset1.ops import multiply +from openvino.opset6.ops import mvn +from openvino.opset1.ops import negative +from openvino.opset9.ops import non_max_suppression +from 
openvino.opset3.ops import non_zero +from openvino.opset1.ops import normalize_l2 +from openvino.opset1.ops import not_equal +from openvino.opset8.ops import nv12_to_bgr +from openvino.opset8.ops import nv12_to_rgb +from openvino.opset1.ops import one_hot +from openvino.opset1.ops import pad +from openvino.opset1.ops import parameter +from openvino.opset1.ops import power +from openvino.opset1.ops import prelu +from openvino.opset8.ops import prior_box +from openvino.opset1.ops import prior_box_clustered +from openvino.opset1.ops import psroi_pooling +from openvino.opset4.ops import proposal +from openvino.opset1.ops import range +from openvino.opset8.ops import random_uniform +from openvino.opset9.ops import rdft +from openvino.opset6.ops import read_value +from openvino.opset4.ops import reduce_l1 +from openvino.opset4.ops import reduce_l2 +from openvino.opset1.ops import reduce_logical_and +from openvino.opset1.ops import reduce_logical_or +from openvino.opset1.ops import reduce_max +from openvino.opset1.ops import reduce_mean +from openvino.opset1.ops import reduce_min +from openvino.opset1.ops import reduce_prod +from openvino.opset1.ops import reduce_sum +from openvino.opset1.ops import region_yolo +from openvino.opset2.ops import reorg_yolo +from openvino.opset1.ops import relu +from openvino.opset1.ops import reshape +from openvino.opset1.ops import result +from openvino.opset1.ops import reverse_sequence +from openvino.opset3.ops import rnn_cell +from openvino.opset5.ops import rnn_sequence +from openvino.opset9.ops import roi_align +from openvino.opset2.ops import roi_pooling +from openvino.opset7.ops import roll +from openvino.opset5.ops import round +from openvino.opset3.ops import scatter_elements_update +from openvino.opset4.ops import scatter_nd_update +from openvino.opset3.ops import scatter_update +from openvino.opset1.ops import select +from openvino.opset1.ops import selu +from openvino.opset3.ops import shape_of +from openvino.opset3.ops import 
shuffle_channels +from openvino.opset1.ops import sigmoid +from openvino.opset1.ops import sign +from openvino.opset1.ops import sin +from openvino.opset1.ops import sinh +from openvino.opset8.ops import slice +from openvino.opset8.ops import softmax +from openvino.opset4.ops import softplus +from openvino.opset9.ops import softsign +from openvino.opset2.ops import space_to_batch +from openvino.opset1.ops import space_to_depth +from openvino.opset1.ops import split +from openvino.opset1.ops import sqrt +from openvino.opset1.ops import squared_difference +from openvino.opset1.ops import squeeze +from openvino.opset1.ops import strided_slice +from openvino.opset1.ops import subtract +from openvino.opset4.ops import swish +from openvino.opset1.ops import tan +from openvino.opset1.ops import tanh +from openvino.opset1.ops import tensor_iterator +from openvino.opset1.ops import tile +from openvino.opset11.ops import topk +from openvino.opset1.ops import transpose +from openvino.opset10.ops import unique +from openvino.opset1.ops import unsqueeze +from openvino.opset1.ops import variadic_split diff --git a/src/bindings/python/src/openvino/runtime/opset11/ops.py b/src/bindings/python/src/openvino/opset11/ops.py similarity index 100% rename from src/bindings/python/src/openvino/runtime/opset11/ops.py rename to src/bindings/python/src/openvino/opset11/ops.py diff --git a/src/bindings/python/src/openvino/opset12/__init__.py b/src/bindings/python/src/openvino/opset12/__init__.py new file mode 100644 index 00000000000000..1187f2c83e6a05 --- /dev/null +++ b/src/bindings/python/src/openvino/opset12/__init__.py @@ -0,0 +1,180 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.opset1.ops import absolute +from openvino.opset1.ops import absolute as abs +from openvino.opset1.ops import acos +from openvino.opset4.ops import acosh +from openvino.opset8.ops import adaptive_avg_pool +from openvino.opset8.ops 
import adaptive_max_pool +from openvino.opset1.ops import add +from openvino.opset1.ops import asin +from openvino.opset4.ops import asinh +from openvino.opset6.ops import assign +from openvino.opset1.ops import atan +from openvino.opset4.ops import atanh +from openvino.opset1.ops import avg_pool +from openvino.opset5.ops import batch_norm_inference +from openvino.opset2.ops import batch_to_space +from openvino.opset1.ops import binary_convolution +from openvino.opset3.ops import broadcast +from openvino.opset3.ops import bucketize +from openvino.opset1.ops import ceiling +from openvino.opset1.ops import ceiling as ceil +from openvino.opset1.ops import clamp +from openvino.opset1.ops import concat +from openvino.opset1.ops import constant +from openvino.opset1.ops import convert +from openvino.opset1.ops import convert_like +from openvino.opset1.ops import convolution +from openvino.opset1.ops import convolution_backprop_data +from openvino.opset1.ops import cos +from openvino.opset1.ops import cosh +from openvino.opset1.ops import ctc_greedy_decoder +from openvino.opset6.ops import ctc_greedy_decoder_seq_len +from openvino.opset4.ops import ctc_loss +from openvino.opset3.ops import cum_sum +from openvino.opset3.ops import cum_sum as cumsum +from openvino.opset8.ops import deformable_convolution +from openvino.opset1.ops import deformable_psroi_pooling +from openvino.opset1.ops import depth_to_space +from openvino.opset8.ops import detection_output +from openvino.opset7.ops import dft +from openvino.opset1.ops import divide +from openvino.opset7.ops import einsum +from openvino.opset1.ops import elu +from openvino.opset3.ops import embedding_bag_offsets_sum +from openvino.opset3.ops import embedding_bag_packed_sum +from openvino.opset3.ops import embedding_segments_sum +from openvino.opset3.ops import extract_image_patches +from openvino.opset1.ops import equal +from openvino.opset1.ops import erf +from openvino.opset1.ops import exp +from openvino.opset9.ops 
import eye +from openvino.opset1.ops import fake_quantize +from openvino.opset1.ops import floor +from openvino.opset1.ops import floor_mod +from openvino.opset8.ops import gather +from openvino.opset6.ops import gather_elements +from openvino.opset8.ops import gather_nd +from openvino.opset1.ops import gather_tree +from openvino.opset7.ops import gelu +from openvino.opset9.ops import generate_proposals +from openvino.opset1.ops import greater +from openvino.opset1.ops import greater_equal +from openvino.opset9.ops import grid_sample +from openvino.opset1.ops import grn +from openvino.opset1.ops import group_convolution +from openvino.opset1.ops import group_convolution_backprop_data +from openvino.opset12.ops import group_normalization +from openvino.opset3.ops import gru_cell +from openvino.opset5.ops import gru_sequence +from openvino.opset1.ops import hard_sigmoid +from openvino.opset5.ops import hsigmoid +from openvino.opset4.ops import hswish +from openvino.opset7.ops import idft +from openvino.opset8.ops import if_op +from openvino.opset11.ops import interpolate +from openvino.opset9.ops import irdft +from openvino.opset10.ops import is_finite +from openvino.opset10.ops import is_inf +from openvino.opset10.ops import is_nan +from openvino.opset8.ops import i420_to_bgr +from openvino.opset8.ops import i420_to_rgb +from openvino.opset1.ops import less +from openvino.opset1.ops import less_equal +from openvino.opset1.ops import log +from openvino.opset1.ops import logical_and +from openvino.opset1.ops import logical_not +from openvino.opset1.ops import logical_or +from openvino.opset1.ops import logical_xor +from openvino.opset5.ops import log_softmax +from openvino.opset5.ops import loop +from openvino.opset1.ops import lrn +from openvino.opset4.ops import lstm_cell +from openvino.opset5.ops import lstm_sequence +from openvino.opset1.ops import matmul +from openvino.opset8.ops import matrix_nms +from openvino.opset8.ops import max_pool +from 
openvino.opset1.ops import maximum +from openvino.opset1.ops import minimum +from openvino.opset4.ops import mish +from openvino.opset1.ops import mod +from openvino.opset9.ops import multiclass_nms +from openvino.opset1.ops import multiply +from openvino.opset6.ops import mvn +from openvino.opset1.ops import negative +from openvino.opset9.ops import non_max_suppression +from openvino.opset3.ops import non_zero +from openvino.opset1.ops import normalize_l2 +from openvino.opset1.ops import not_equal +from openvino.opset8.ops import nv12_to_bgr +from openvino.opset8.ops import nv12_to_rgb +from openvino.opset1.ops import one_hot +from openvino.opset12.ops import pad +from openvino.opset1.ops import parameter +from openvino.opset1.ops import power +from openvino.opset1.ops import prelu +from openvino.opset8.ops import prior_box +from openvino.opset1.ops import prior_box_clustered +from openvino.opset1.ops import psroi_pooling +from openvino.opset4.ops import proposal +from openvino.opset4.ops import range +from openvino.opset8.ops import random_uniform +from openvino.opset9.ops import rdft +from openvino.opset6.ops import read_value +from openvino.opset4.ops import reduce_l1 +from openvino.opset4.ops import reduce_l2 +from openvino.opset1.ops import reduce_logical_and +from openvino.opset1.ops import reduce_logical_or +from openvino.opset1.ops import reduce_max +from openvino.opset1.ops import reduce_mean +from openvino.opset1.ops import reduce_min +from openvino.opset1.ops import reduce_prod +from openvino.opset1.ops import reduce_sum +from openvino.opset1.ops import region_yolo +from openvino.opset2.ops import reorg_yolo +from openvino.opset1.ops import relu +from openvino.opset1.ops import reshape +from openvino.opset1.ops import result +from openvino.opset1.ops import reverse_sequence +from openvino.opset3.ops import rnn_cell +from openvino.opset5.ops import rnn_sequence +from openvino.opset9.ops import roi_align +from openvino.opset2.ops import roi_pooling +from 
openvino.opset7.ops import roll +from openvino.opset5.ops import round +from openvino.opset12.ops import scatter_elements_update +from openvino.opset4.ops import scatter_nd_update +from openvino.opset3.ops import scatter_update +from openvino.opset1.ops import select +from openvino.opset1.ops import selu +from openvino.opset3.ops import shape_of +from openvino.opset3.ops import shuffle_channels +from openvino.opset1.ops import sigmoid +from openvino.opset1.ops import sign +from openvino.opset1.ops import sin +from openvino.opset1.ops import sinh +from openvino.opset8.ops import slice +from openvino.opset8.ops import softmax +from openvino.opset4.ops import softplus +from openvino.opset9.ops import softsign +from openvino.opset2.ops import space_to_batch +from openvino.opset1.ops import space_to_depth +from openvino.opset1.ops import split +from openvino.opset1.ops import sqrt +from openvino.opset1.ops import squared_difference +from openvino.opset1.ops import squeeze +from openvino.opset1.ops import strided_slice +from openvino.opset1.ops import subtract +from openvino.opset4.ops import swish +from openvino.opset1.ops import tan +from openvino.opset1.ops import tanh +from openvino.opset1.ops import tensor_iterator +from openvino.opset1.ops import tile +from openvino.opset11.ops import topk +from openvino.opset1.ops import transpose +from openvino.opset10.ops import unique +from openvino.opset1.ops import unsqueeze +from openvino.opset1.ops import variadic_split diff --git a/src/bindings/python/src/openvino/runtime/opset12/ops.py b/src/bindings/python/src/openvino/opset12/ops.py similarity index 100% rename from src/bindings/python/src/openvino/runtime/opset12/ops.py rename to src/bindings/python/src/openvino/opset12/ops.py diff --git a/src/bindings/python/src/openvino/opset13/__init__.py b/src/bindings/python/src/openvino/opset13/__init__.py new file mode 100644 index 00000000000000..ab3f541e8f831c --- /dev/null +++ 
b/src/bindings/python/src/openvino/opset13/__init__.py @@ -0,0 +1,188 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.opset1.ops import absolute +from openvino.opset1.ops import absolute as abs +from openvino.opset1.ops import acos +from openvino.opset4.ops import acosh +from openvino.opset8.ops import adaptive_avg_pool +from openvino.opset8.ops import adaptive_max_pool +from openvino.opset1.ops import add +from openvino.opset1.ops import asin +from openvino.opset4.ops import asinh +from openvino.opset6.ops import assign +from openvino.opset1.ops import atan +from openvino.opset4.ops import atanh +from openvino.opset1.ops import avg_pool +from openvino.opset5.ops import batch_norm_inference +from openvino.opset2.ops import batch_to_space +from openvino.opset1.ops import binary_convolution +from openvino.opset13.ops import bitwise_and +from openvino.opset13.ops import bitwise_not +from openvino.opset13.ops import bitwise_or +from openvino.opset13.ops import bitwise_xor +from openvino.opset3.ops import broadcast +from openvino.opset3.ops import bucketize +from openvino.opset1.ops import ceiling +from openvino.opset1.ops import ceiling as ceil +from openvino.opset1.ops import clamp +from openvino.opset1.ops import concat +from openvino.opset13.ops import constant +from openvino.opset1.ops import convert +from openvino.opset1.ops import convert_like +from openvino.opset1.ops import convolution +from openvino.opset1.ops import convolution_backprop_data +from openvino.opset1.ops import cos +from openvino.opset1.ops import cosh +from openvino.opset1.ops import ctc_greedy_decoder +from openvino.opset6.ops import ctc_greedy_decoder_seq_len +from openvino.opset4.ops import ctc_loss +from openvino.opset3.ops import cum_sum +from openvino.opset3.ops import cum_sum as cumsum +from openvino.opset8.ops import deformable_convolution +from openvino.opset1.ops import deformable_psroi_pooling +from 
openvino.opset1.ops import depth_to_space +from openvino.opset8.ops import detection_output +from openvino.opset7.ops import dft +from openvino.opset1.ops import divide +from openvino.opset7.ops import einsum +from openvino.opset1.ops import elu +from openvino.opset3.ops import embedding_bag_offsets_sum +from openvino.opset3.ops import embedding_bag_packed_sum +from openvino.opset3.ops import embedding_segments_sum +from openvino.opset3.ops import extract_image_patches +from openvino.opset1.ops import equal +from openvino.opset1.ops import erf +from openvino.opset1.ops import exp +from openvino.opset9.ops import eye +from openvino.opset13.ops import fake_convert +from openvino.opset13.ops import fake_quantize +from openvino.opset1.ops import floor +from openvino.opset1.ops import floor_mod +from openvino.opset8.ops import gather +from openvino.opset6.ops import gather_elements +from openvino.opset8.ops import gather_nd +from openvino.opset1.ops import gather_tree +from openvino.opset7.ops import gelu +from openvino.opset9.ops import generate_proposals +from openvino.opset1.ops import greater +from openvino.opset1.ops import greater_equal +from openvino.opset9.ops import grid_sample +from openvino.opset1.ops import grn +from openvino.opset1.ops import group_convolution +from openvino.opset1.ops import group_convolution_backprop_data +from openvino.opset12.ops import group_normalization +from openvino.opset3.ops import gru_cell +from openvino.opset5.ops import gru_sequence +from openvino.opset1.ops import hard_sigmoid +from openvino.opset5.ops import hsigmoid +from openvino.opset4.ops import hswish +from openvino.opset7.ops import idft +from openvino.opset8.ops import if_op +from openvino.opset11.ops import interpolate +from openvino.opset9.ops import irdft +from openvino.opset10.ops import is_finite +from openvino.opset10.ops import is_inf +from openvino.opset10.ops import is_nan +from openvino.opset8.ops import i420_to_bgr +from openvino.opset8.ops import 
i420_to_rgb +from openvino.opset1.ops import less +from openvino.opset1.ops import less_equal +from openvino.opset1.ops import log +from openvino.opset1.ops import logical_and +from openvino.opset1.ops import logical_not +from openvino.opset1.ops import logical_or +from openvino.opset1.ops import logical_xor +from openvino.opset5.ops import log_softmax +from openvino.opset5.ops import loop +from openvino.opset1.ops import lrn +from openvino.opset4.ops import lstm_cell +from openvino.opset5.ops import lstm_sequence +from openvino.opset1.ops import matmul +from openvino.opset8.ops import matrix_nms +from openvino.opset8.ops import max_pool +from openvino.opset1.ops import maximum +from openvino.opset1.ops import minimum +from openvino.opset4.ops import mish +from openvino.opset1.ops import mod +from openvino.opset9.ops import multiclass_nms +from openvino.opset13.ops import multinomial +from openvino.opset1.ops import multiply +from openvino.opset6.ops import mvn +from openvino.opset1.ops import negative +from openvino.opset13.ops import nms_rotated +from openvino.opset9.ops import non_max_suppression +from openvino.opset3.ops import non_zero +from openvino.opset1.ops import normalize_l2 +from openvino.opset1.ops import not_equal +from openvino.opset8.ops import nv12_to_bgr +from openvino.opset8.ops import nv12_to_rgb +from openvino.opset1.ops import one_hot +from openvino.opset12.ops import pad +from openvino.opset1.ops import parameter +from openvino.opset1.ops import power +from openvino.opset1.ops import prelu +from openvino.opset8.ops import prior_box +from openvino.opset1.ops import prior_box_clustered +from openvino.opset1.ops import psroi_pooling +from openvino.opset4.ops import proposal +from openvino.opset4.ops import range +from openvino.opset8.ops import random_uniform +from openvino.opset9.ops import rdft +from openvino.opset6.ops import read_value +from openvino.opset4.ops import reduce_l1 +from openvino.opset4.ops import reduce_l2 +from 
openvino.opset1.ops import reduce_logical_and +from openvino.opset1.ops import reduce_logical_or +from openvino.opset1.ops import reduce_max +from openvino.opset1.ops import reduce_mean +from openvino.opset1.ops import reduce_min +from openvino.opset1.ops import reduce_prod +from openvino.opset1.ops import reduce_sum +from openvino.opset1.ops import region_yolo +from openvino.opset2.ops import reorg_yolo +from openvino.opset1.ops import relu +from openvino.opset1.ops import reshape +from openvino.opset13.ops import result +from openvino.opset1.ops import reverse_sequence +from openvino.opset3.ops import rnn_cell +from openvino.opset5.ops import rnn_sequence +from openvino.opset9.ops import roi_align +from openvino.opset2.ops import roi_pooling +from openvino.opset7.ops import roll +from openvino.opset5.ops import round +from openvino.opset13.ops import scaled_dot_product_attention +from openvino.opset12.ops import scatter_elements_update +from openvino.opset4.ops import scatter_nd_update +from openvino.opset3.ops import scatter_update +from openvino.opset1.ops import select +from openvino.opset1.ops import selu +from openvino.opset3.ops import shape_of +from openvino.opset3.ops import shuffle_channels +from openvino.opset1.ops import sigmoid +from openvino.opset1.ops import sign +from openvino.opset1.ops import sin +from openvino.opset1.ops import sinh +from openvino.opset8.ops import slice +from openvino.opset8.ops import softmax +from openvino.opset4.ops import softplus +from openvino.opset9.ops import softsign +from openvino.opset2.ops import space_to_batch +from openvino.opset1.ops import space_to_depth +from openvino.opset1.ops import split +from openvino.opset1.ops import sqrt +from openvino.opset1.ops import squared_difference +from openvino.opset1.ops import squeeze +from openvino.opset1.ops import strided_slice +from openvino.opset1.ops import subtract +from openvino.opset4.ops import swish +from openvino.opset1.ops import tan +from openvino.opset1.ops 
import tanh +from openvino.opset1.ops import tensor_iterator +from openvino.opset1.ops import tile +from openvino.opset11.ops import topk +from openvino.opset1.ops import transpose +from openvino.opset10.ops import unique +from openvino.opset1.ops import unsqueeze +from openvino.opset1.ops import variadic_split diff --git a/src/bindings/python/src/openvino/runtime/opset13/ops.py b/src/bindings/python/src/openvino/opset13/ops.py similarity index 99% rename from src/bindings/python/src/openvino/runtime/opset13/ops.py rename to src/bindings/python/src/openvino/opset13/ops.py index a624ffb4f79873..12f0d06b1a28e6 100644 --- a/src/bindings/python/src/openvino/runtime/opset13/ops.py +++ b/src/bindings/python/src/openvino/opset13/ops.py @@ -12,8 +12,8 @@ log = logging.getLogger(__name__) from openvino.runtime import Node, Shape, Type, Output, Tensor -from openvino.runtime.op import Constant, Result -from openvino.runtime.opset1 import convert_like +from openvino.op import Constant, Result +from openvino.opset1 import convert_like from openvino.runtime.opset_utils import _get_node_factory from openvino.runtime.utils.decorators import binary_op, nameable_op, unary_op, overloading from openvino.runtime.utils.types import ( diff --git a/src/bindings/python/src/openvino/opset14/__init__.py b/src/bindings/python/src/openvino/opset14/__init__.py new file mode 100644 index 00000000000000..8a503a333bd3e1 --- /dev/null +++ b/src/bindings/python/src/openvino/opset14/__init__.py @@ -0,0 +1,190 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.opset1.ops import absolute +from openvino.opset1.ops import absolute as abs +from openvino.opset1.ops import acos +from openvino.opset4.ops import acosh +from openvino.opset8.ops import adaptive_avg_pool +from openvino.opset8.ops import adaptive_max_pool +from openvino.opset1.ops import add +from openvino.opset1.ops import asin +from openvino.opset4.ops import asinh 
+from openvino.opset6.ops import assign +from openvino.opset1.ops import atan +from openvino.opset4.ops import atanh +from openvino.opset14.ops import avg_pool +from openvino.opset5.ops import batch_norm_inference +from openvino.opset2.ops import batch_to_space +from openvino.opset1.ops import binary_convolution +from openvino.opset13.ops import bitwise_and +from openvino.opset13.ops import bitwise_not +from openvino.opset13.ops import bitwise_or +from openvino.opset13.ops import bitwise_xor +from openvino.opset3.ops import broadcast +from openvino.opset3.ops import bucketize +from openvino.opset1.ops import ceiling +from openvino.opset1.ops import ceiling as ceil +from openvino.opset1.ops import clamp +from openvino.opset1.ops import concat +from openvino.opset13.ops import constant +from openvino.opset1.ops import convert +from openvino.opset1.ops import convert_like +from openvino.opset14.ops import convert_promote_types +from openvino.opset1.ops import convolution +from openvino.opset1.ops import convolution_backprop_data +from openvino.opset1.ops import cos +from openvino.opset1.ops import cosh +from openvino.opset1.ops import ctc_greedy_decoder +from openvino.opset6.ops import ctc_greedy_decoder_seq_len +from openvino.opset4.ops import ctc_loss +from openvino.opset3.ops import cum_sum +from openvino.opset3.ops import cum_sum as cumsum +from openvino.opset8.ops import deformable_convolution +from openvino.opset1.ops import deformable_psroi_pooling +from openvino.opset1.ops import depth_to_space +from openvino.opset8.ops import detection_output +from openvino.opset7.ops import dft +from openvino.opset1.ops import divide +from openvino.opset7.ops import einsum +from openvino.opset1.ops import elu +from openvino.opset3.ops import embedding_bag_offsets_sum +from openvino.opset3.ops import embedding_bag_packed_sum +from openvino.opset3.ops import embedding_segments_sum +from openvino.opset3.ops import extract_image_patches +from openvino.opset1.ops import equal 
+from openvino.opset1.ops import erf +from openvino.opset1.ops import exp +from openvino.opset9.ops import eye +from openvino.opset13.ops import fake_convert +from openvino.opset13.ops import fake_quantize +from openvino.opset1.ops import floor +from openvino.opset1.ops import floor_mod +from openvino.opset8.ops import gather +from openvino.opset6.ops import gather_elements +from openvino.opset8.ops import gather_nd +from openvino.opset1.ops import gather_tree +from openvino.opset7.ops import gelu +from openvino.opset9.ops import generate_proposals +from openvino.opset1.ops import greater +from openvino.opset1.ops import greater_equal +from openvino.opset9.ops import grid_sample +from openvino.opset1.ops import grn +from openvino.opset1.ops import group_convolution +from openvino.opset1.ops import group_convolution_backprop_data +from openvino.opset12.ops import group_normalization +from openvino.opset3.ops import gru_cell +from openvino.opset5.ops import gru_sequence +from openvino.opset1.ops import hard_sigmoid +from openvino.opset5.ops import hsigmoid +from openvino.opset4.ops import hswish +from openvino.opset7.ops import idft +from openvino.opset8.ops import if_op +from openvino.opset11.ops import interpolate +from openvino.opset14.ops import inverse +from openvino.opset9.ops import irdft +from openvino.opset10.ops import is_finite +from openvino.opset10.ops import is_inf +from openvino.opset10.ops import is_nan +from openvino.opset8.ops import i420_to_bgr +from openvino.opset8.ops import i420_to_rgb +from openvino.opset1.ops import less +from openvino.opset1.ops import less_equal +from openvino.opset1.ops import log +from openvino.opset1.ops import logical_and +from openvino.opset1.ops import logical_not +from openvino.opset1.ops import logical_or +from openvino.opset1.ops import logical_xor +from openvino.opset5.ops import log_softmax +from openvino.opset5.ops import loop +from openvino.opset1.ops import lrn +from openvino.opset4.ops import lstm_cell +from 
openvino.opset5.ops import lstm_sequence +from openvino.opset1.ops import matmul +from openvino.opset8.ops import matrix_nms +from openvino.opset14.ops import max_pool +from openvino.opset1.ops import maximum +from openvino.opset1.ops import minimum +from openvino.opset4.ops import mish +from openvino.opset1.ops import mod +from openvino.opset9.ops import multiclass_nms +from openvino.opset13.ops import multinomial +from openvino.opset1.ops import multiply +from openvino.opset6.ops import mvn +from openvino.opset1.ops import negative +from openvino.opset13.ops import nms_rotated +from openvino.opset9.ops import non_max_suppression +from openvino.opset3.ops import non_zero +from openvino.opset1.ops import normalize_l2 +from openvino.opset1.ops import not_equal +from openvino.opset8.ops import nv12_to_bgr +from openvino.opset8.ops import nv12_to_rgb +from openvino.opset1.ops import one_hot +from openvino.opset12.ops import pad +from openvino.opset1.ops import parameter +from openvino.opset1.ops import power +from openvino.opset1.ops import prelu +from openvino.opset8.ops import prior_box +from openvino.opset1.ops import prior_box_clustered +from openvino.opset1.ops import psroi_pooling +from openvino.opset4.ops import proposal +from openvino.opset4.ops import range +from openvino.opset8.ops import random_uniform +from openvino.opset9.ops import rdft +from openvino.opset6.ops import read_value +from openvino.opset4.ops import reduce_l1 +from openvino.opset4.ops import reduce_l2 +from openvino.opset1.ops import reduce_logical_and +from openvino.opset1.ops import reduce_logical_or +from openvino.opset1.ops import reduce_max +from openvino.opset1.ops import reduce_mean +from openvino.opset1.ops import reduce_min +from openvino.opset1.ops import reduce_prod +from openvino.opset1.ops import reduce_sum +from openvino.opset1.ops import region_yolo +from openvino.opset2.ops import reorg_yolo +from openvino.opset1.ops import relu +from openvino.opset1.ops import reshape +from 
openvino.opset13.ops import result +from openvino.opset1.ops import reverse_sequence +from openvino.opset3.ops import rnn_cell +from openvino.opset5.ops import rnn_sequence +from openvino.opset9.ops import roi_align +from openvino.opset2.ops import roi_pooling +from openvino.opset7.ops import roll +from openvino.opset5.ops import round +from openvino.opset13.ops import scaled_dot_product_attention +from openvino.opset12.ops import scatter_elements_update +from openvino.opset4.ops import scatter_nd_update +from openvino.opset3.ops import scatter_update +from openvino.opset1.ops import select +from openvino.opset1.ops import selu +from openvino.opset3.ops import shape_of +from openvino.opset3.ops import shuffle_channels +from openvino.opset1.ops import sigmoid +from openvino.opset1.ops import sign +from openvino.opset1.ops import sin +from openvino.opset1.ops import sinh +from openvino.opset8.ops import slice +from openvino.opset8.ops import softmax +from openvino.opset4.ops import softplus +from openvino.opset9.ops import softsign +from openvino.opset2.ops import space_to_batch +from openvino.opset1.ops import space_to_depth +from openvino.opset1.ops import split +from openvino.opset1.ops import sqrt +from openvino.opset1.ops import squared_difference +from openvino.opset1.ops import squeeze +from openvino.opset1.ops import strided_slice +from openvino.opset1.ops import subtract +from openvino.opset4.ops import swish +from openvino.opset1.ops import tan +from openvino.opset1.ops import tanh +from openvino.opset1.ops import tensor_iterator +from openvino.opset1.ops import tile +from openvino.opset11.ops import topk +from openvino.opset1.ops import transpose +from openvino.opset10.ops import unique +from openvino.opset1.ops import unsqueeze +from openvino.opset1.ops import variadic_split diff --git a/src/bindings/python/src/openvino/runtime/opset14/ops.py b/src/bindings/python/src/openvino/opset14/ops.py similarity index 100% rename from 
src/bindings/python/src/openvino/runtime/opset14/ops.py rename to src/bindings/python/src/openvino/opset14/ops.py diff --git a/src/bindings/python/src/openvino/opset15/__init__.py b/src/bindings/python/src/openvino/opset15/__init__.py new file mode 100644 index 00000000000000..d5b93924a69e6e --- /dev/null +++ b/src/bindings/python/src/openvino/opset15/__init__.py @@ -0,0 +1,204 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# New operations added in Opset15 +from openvino.opset15.ops import col2im +from openvino.opset15.ops import embedding_bag_offsets +from openvino.opset15.ops import embedding_bag_packed +from openvino.opset15.ops import scatter_nd_update +from openvino.opset15.ops import roi_align_rotated +from openvino.opset15.ops import string_tensor_pack +from openvino.opset15.ops import string_tensor_unpack +from openvino.opset15.ops import bitwise_left_shift +from openvino.opset15.ops import bitwise_right_shift +from openvino.opset15.ops import slice_scatter + +# Operators from previous opsets +from openvino.opset1.ops import absolute +from openvino.opset1.ops import absolute as abs +from openvino.opset1.ops import acos +from openvino.opset4.ops import acosh +from openvino.opset8.ops import adaptive_avg_pool +from openvino.opset8.ops import adaptive_max_pool +from openvino.opset1.ops import add +from openvino.opset1.ops import asin +from openvino.opset4.ops import asinh +from openvino.opset6.ops import assign +from openvino.opset1.ops import atan +from openvino.opset4.ops import atanh +from openvino.opset14.ops import avg_pool +from openvino.opset5.ops import batch_norm_inference +from openvino.opset2.ops import batch_to_space +from openvino.opset1.ops import binary_convolution +from openvino.opset13.ops import bitwise_and +from openvino.opset13.ops import bitwise_not +from openvino.opset13.ops import bitwise_or +from openvino.opset13.ops import bitwise_xor +from openvino.opset3.ops import 
broadcast +from openvino.opset3.ops import bucketize +from openvino.opset1.ops import ceiling +from openvino.opset1.ops import ceiling as ceil +from openvino.opset1.ops import clamp +from openvino.opset1.ops import concat +from openvino.opset13.ops import constant +from openvino.opset1.ops import convert +from openvino.opset1.ops import convert_like +from openvino.opset14.ops import convert_promote_types +from openvino.opset1.ops import convolution +from openvino.opset1.ops import convolution_backprop_data +from openvino.opset1.ops import cos +from openvino.opset1.ops import cosh +from openvino.opset1.ops import ctc_greedy_decoder +from openvino.opset6.ops import ctc_greedy_decoder_seq_len +from openvino.opset4.ops import ctc_loss +from openvino.opset3.ops import cum_sum +from openvino.opset3.ops import cum_sum as cumsum +from openvino.opset8.ops import deformable_convolution +from openvino.opset1.ops import deformable_psroi_pooling +from openvino.opset1.ops import depth_to_space +from openvino.opset8.ops import detection_output +from openvino.opset7.ops import dft +from openvino.opset1.ops import divide +from openvino.opset7.ops import einsum +from openvino.opset1.ops import elu +from openvino.opset3.ops import embedding_bag_offsets_sum +from openvino.opset3.ops import embedding_bag_packed_sum +from openvino.opset3.ops import embedding_segments_sum +from openvino.opset3.ops import extract_image_patches +from openvino.opset1.ops import equal +from openvino.opset1.ops import erf +from openvino.opset1.ops import exp +from openvino.opset9.ops import eye +from openvino.opset13.ops import fake_convert +from openvino.opset13.ops import fake_quantize +from openvino.opset1.ops import floor +from openvino.opset1.ops import floor_mod +from openvino.opset8.ops import gather +from openvino.opset6.ops import gather_elements +from openvino.opset8.ops import gather_nd +from openvino.opset1.ops import gather_tree +from openvino.opset7.ops import gelu +from openvino.opset9.ops 
import generate_proposals +from openvino.opset1.ops import greater +from openvino.opset1.ops import greater_equal +from openvino.opset9.ops import grid_sample +from openvino.opset1.ops import grn +from openvino.opset1.ops import group_convolution +from openvino.opset1.ops import group_convolution_backprop_data +from openvino.opset12.ops import group_normalization +from openvino.opset3.ops import gru_cell +from openvino.opset5.ops import gru_sequence +from openvino.opset1.ops import hard_sigmoid +from openvino.opset5.ops import hsigmoid +from openvino.opset4.ops import hswish +from openvino.opset7.ops import idft +from openvino.opset8.ops import if_op +from openvino.opset11.ops import interpolate +from openvino.opset14.ops import inverse +from openvino.opset9.ops import irdft +from openvino.opset10.ops import is_finite +from openvino.opset10.ops import is_inf +from openvino.opset10.ops import is_nan +from openvino.opset8.ops import i420_to_bgr +from openvino.opset8.ops import i420_to_rgb +from openvino.opset1.ops import less +from openvino.opset1.ops import less_equal +from openvino.opset1.ops import log +from openvino.opset1.ops import logical_and +from openvino.opset1.ops import logical_not +from openvino.opset1.ops import logical_or +from openvino.opset1.ops import logical_xor +from openvino.opset5.ops import log_softmax +from openvino.opset5.ops import loop +from openvino.opset1.ops import lrn +from openvino.opset4.ops import lstm_cell +from openvino.opset5.ops import lstm_sequence +from openvino.opset1.ops import matmul +from openvino.opset8.ops import matrix_nms +from openvino.opset14.ops import max_pool +from openvino.opset1.ops import maximum +from openvino.opset1.ops import minimum +from openvino.opset4.ops import mish +from openvino.opset1.ops import mod +from openvino.opset9.ops import multiclass_nms +from openvino.opset13.ops import multinomial +from openvino.opset1.ops import multiply +from openvino.opset6.ops import mvn +from openvino.opset1.ops import 
negative +from openvino.opset13.ops import nms_rotated +from openvino.opset9.ops import non_max_suppression +from openvino.opset3.ops import non_zero +from openvino.opset1.ops import normalize_l2 +from openvino.opset1.ops import not_equal +from openvino.opset8.ops import nv12_to_bgr +from openvino.opset8.ops import nv12_to_rgb +from openvino.opset1.ops import one_hot +from openvino.opset12.ops import pad +from openvino.opset1.ops import parameter +from openvino.opset1.ops import power +from openvino.opset1.ops import prelu +from openvino.opset8.ops import prior_box +from openvino.opset1.ops import prior_box_clustered +from openvino.opset1.ops import psroi_pooling +from openvino.opset4.ops import proposal +from openvino.opset4.ops import range +from openvino.opset8.ops import random_uniform +from openvino.opset9.ops import rdft +from openvino.opset6.ops import read_value +from openvino.opset4.ops import reduce_l1 +from openvino.opset4.ops import reduce_l2 +from openvino.opset1.ops import reduce_logical_and +from openvino.opset1.ops import reduce_logical_or +from openvino.opset1.ops import reduce_max +from openvino.opset1.ops import reduce_mean +from openvino.opset1.ops import reduce_min +from openvino.opset1.ops import reduce_prod +from openvino.opset1.ops import reduce_sum +from openvino.opset1.ops import region_yolo +from openvino.opset2.ops import reorg_yolo +from openvino.opset1.ops import relu +from openvino.opset1.ops import reshape +from openvino.opset13.ops import result +from openvino.opset1.ops import reverse_sequence +from openvino.opset3.ops import rnn_cell +from openvino.opset5.ops import rnn_sequence +from openvino.opset9.ops import roi_align +from openvino.opset2.ops import roi_pooling +from openvino.opset7.ops import roll +from openvino.opset5.ops import round +from openvino.opset13.ops import scaled_dot_product_attention +from openvino.opset12.ops import scatter_elements_update +from openvino.opset3.ops import scatter_update +from 
openvino.opset15.ops import search_sorted +from openvino.opset1.ops import select +from openvino.opset1.ops import selu +from openvino.opset3.ops import shape_of +from openvino.opset3.ops import shuffle_channels +from openvino.opset1.ops import sigmoid +from openvino.opset1.ops import sign +from openvino.opset1.ops import sin +from openvino.opset1.ops import sinh +from openvino.opset8.ops import slice +from openvino.opset8.ops import softmax +from openvino.opset4.ops import softplus +from openvino.opset9.ops import softsign +from openvino.opset2.ops import space_to_batch +from openvino.opset1.ops import space_to_depth +from openvino.opset1.ops import split +from openvino.opset1.ops import sqrt +from openvino.opset1.ops import squared_difference +from openvino.opset15.ops import squeeze +from openvino.opset15.ops import stft +from openvino.opset1.ops import strided_slice +from openvino.opset1.ops import subtract +from openvino.opset4.ops import swish +from openvino.opset1.ops import tan +from openvino.opset1.ops import tanh +from openvino.opset1.ops import tensor_iterator +from openvino.opset1.ops import tile +from openvino.opset11.ops import topk +from openvino.opset1.ops import transpose +from openvino.opset10.ops import unique +from openvino.opset1.ops import unsqueeze +from openvino.opset1.ops import variadic_split diff --git a/src/bindings/python/src/openvino/runtime/opset15/ops.py b/src/bindings/python/src/openvino/opset15/ops.py similarity index 99% rename from src/bindings/python/src/openvino/runtime/opset15/ops.py rename to src/bindings/python/src/openvino/opset15/ops.py index 93aacb29572340..8e6b8bd46d5f7c 100644 --- a/src/bindings/python/src/openvino/runtime/opset15/ops.py +++ b/src/bindings/python/src/openvino/opset15/ops.py @@ -8,8 +8,8 @@ import numpy as np from openvino.runtime import Node, Type -from openvino.runtime.opset1 import convert_like -from openvino.runtime.opset14 import constant +from openvino.opset1 import convert_like +from 
openvino.opset14 import constant from openvino.runtime.opset_utils import _get_node_factory from openvino.runtime.utils.decorators import binary_op, nameable_op from openvino.runtime.utils.types import NodeInput, as_nodes diff --git a/src/bindings/python/src/openvino/opset16/__init__.py b/src/bindings/python/src/openvino/opset16/__init__.py new file mode 100644 index 00000000000000..06fa9d2e7d0070 --- /dev/null +++ b/src/bindings/python/src/openvino/opset16/__init__.py @@ -0,0 +1,9 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# New operations added in Opset16 +from openvino.opset16.ops import identity + +# Operators from previous opsets +# TODO (ticket: 156877): Add previous opset operators at the end of opset16 development diff --git a/src/bindings/python/src/openvino/runtime/opset16/ops.py b/src/bindings/python/src/openvino/opset16/ops.py similarity index 100% rename from src/bindings/python/src/openvino/runtime/opset16/ops.py rename to src/bindings/python/src/openvino/opset16/ops.py diff --git a/src/bindings/python/src/openvino/opset2/__init__.py b/src/bindings/python/src/openvino/opset2/__init__.py new file mode 100644 index 00000000000000..1306c89b5241d8 --- /dev/null +++ b/src/bindings/python/src/openvino/opset2/__init__.py @@ -0,0 +1,117 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.opset1.ops import absolute +from openvino.opset1.ops import absolute as abs +from openvino.opset1.ops import acos +from openvino.opset1.ops import add +from openvino.opset1.ops import asin +from openvino.opset1.ops import atan +from openvino.opset1.ops import avg_pool +from openvino.opset1.ops import batch_norm_inference +from openvino.opset2.ops import batch_to_space +from openvino.opset1.ops import binary_convolution +from openvino.opset1.ops import broadcast +from openvino.opset1.ops import ceiling +from openvino.opset1.ops 
import ceiling as ceil +from openvino.opset1.ops import clamp +from openvino.opset1.ops import concat +from openvino.opset1.ops import constant +from openvino.opset1.ops import convert +from openvino.opset1.ops import convert_like +from openvino.opset1.ops import convolution +from openvino.opset1.ops import convolution_backprop_data +from openvino.opset1.ops import cos +from openvino.opset1.ops import cosh +from openvino.opset1.ops import ctc_greedy_decoder +from openvino.opset1.ops import deformable_convolution +from openvino.opset1.ops import deformable_psroi_pooling +from openvino.opset1.ops import depth_to_space +from openvino.opset1.ops import detection_output +from openvino.opset1.ops import divide +from openvino.opset1.ops import elu +from openvino.opset1.ops import equal +from openvino.opset1.ops import erf +from openvino.opset1.ops import exp +from openvino.opset1.ops import fake_quantize +from openvino.opset1.ops import floor +from openvino.opset1.ops import floor_mod +from openvino.opset1.ops import gather +from openvino.opset1.ops import gather_tree +from openvino.opset2.ops import gelu +from openvino.opset1.ops import greater +from openvino.opset1.ops import greater_equal +from openvino.opset1.ops import grn +from openvino.opset1.ops import group_convolution +from openvino.opset1.ops import group_convolution_backprop_data +from openvino.opset1.ops import hard_sigmoid +from openvino.opset1.ops import interpolate +from openvino.opset1.ops import less +from openvino.opset1.ops import less_equal +from openvino.opset1.ops import log +from openvino.opset1.ops import logical_and +from openvino.opset1.ops import logical_not +from openvino.opset1.ops import logical_or +from openvino.opset1.ops import logical_xor +from openvino.opset1.ops import lrn +from openvino.opset1.ops import lstm_cell +from openvino.opset1.ops import matmul +from openvino.opset1.ops import max_pool +from openvino.opset1.ops import maximum +from openvino.opset1.ops import minimum +from 
openvino.opset1.ops import mod +from openvino.opset1.ops import multiply +from openvino.opset2.ops import mvn +from openvino.opset1.ops import negative +from openvino.opset1.ops import non_max_suppression +from openvino.opset1.ops import normalize_l2 +from openvino.opset1.ops import not_equal +from openvino.opset1.ops import one_hot +from openvino.opset1.ops import pad +from openvino.opset1.ops import parameter +from openvino.opset1.ops import power +from openvino.opset1.ops import prelu +from openvino.opset1.ops import prior_box +from openvino.opset1.ops import prior_box_clustered +from openvino.opset1.ops import psroi_pooling +from openvino.opset1.ops import proposal +from openvino.opset1.ops import range +from openvino.opset1.ops import reduce_logical_and +from openvino.opset1.ops import reduce_logical_or +from openvino.opset1.ops import reduce_max +from openvino.opset1.ops import reduce_mean +from openvino.opset1.ops import reduce_min +from openvino.opset1.ops import reduce_prod +from openvino.opset1.ops import reduce_sum +from openvino.opset1.ops import region_yolo +from openvino.opset2.ops import reorg_yolo +from openvino.opset1.ops import relu +from openvino.opset1.ops import reshape +from openvino.opset1.ops import result +from openvino.opset1.ops import reverse_sequence +from openvino.opset2.ops import roi_pooling +from openvino.opset1.ops import select +from openvino.opset1.ops import selu +from openvino.opset1.ops import shape_of +from openvino.opset1.ops import sigmoid +from openvino.opset1.ops import sign +from openvino.opset1.ops import sin +from openvino.opset1.ops import sinh +from openvino.opset1.ops import softmax +from openvino.opset2.ops import space_to_batch +from openvino.opset1.ops import space_to_depth +from openvino.opset1.ops import split +from openvino.opset1.ops import sqrt +from openvino.opset1.ops import squared_difference +from openvino.opset1.ops import squeeze +from openvino.opset1.ops import strided_slice +from openvino.opset1.ops 
import subtract +from openvino.opset1.ops import tan +from openvino.opset1.ops import tanh +from openvino.opset1.ops import tensor_iterator +from openvino.opset1.ops import tile +from openvino.opset1.ops import topk +from openvino.opset1.ops import transpose +from openvino.opset1.ops import unsqueeze +from openvino.opset1.ops import variadic_split diff --git a/src/bindings/python/src/openvino/runtime/opset2/ops.py b/src/bindings/python/src/openvino/opset2/ops.py similarity index 99% rename from src/bindings/python/src/openvino/runtime/opset2/ops.py rename to src/bindings/python/src/openvino/opset2/ops.py index 287232d114c6bc..45b33f5bc0288b 100644 --- a/src/bindings/python/src/openvino/runtime/opset2/ops.py +++ b/src/bindings/python/src/openvino/opset2/ops.py @@ -10,7 +10,7 @@ import warnings from openvino.runtime import Node, Shape -from openvino.runtime.op import Constant, Parameter +from openvino.op import Constant, Parameter from openvino.runtime.opset_utils import _get_node_factory from openvino.runtime.utils.decorators import binary_op, nameable_op, unary_op from openvino.runtime.utils.input_validation import ( diff --git a/src/bindings/python/src/openvino/opset3/__init__.py b/src/bindings/python/src/openvino/opset3/__init__.py new file mode 100644 index 00000000000000..3a0baa675114f1 --- /dev/null +++ b/src/bindings/python/src/openvino/opset3/__init__.py @@ -0,0 +1,133 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.opset1.ops import absolute +from openvino.opset1.ops import absolute as abs +from openvino.opset1.ops import acos +from openvino.opset1.ops import add +from openvino.opset1.ops import asin +from openvino.opset3.ops import assign +from openvino.opset1.ops import atan +from openvino.opset1.ops import avg_pool +from openvino.opset1.ops import batch_norm_inference +from openvino.opset2.ops import batch_to_space +from openvino.opset1.ops import binary_convolution +from 
openvino.opset3.ops import broadcast +from openvino.opset3.ops import bucketize +from openvino.opset1.ops import ceiling +from openvino.opset1.ops import ceiling as ceil +from openvino.opset1.ops import clamp +from openvino.opset1.ops import concat +from openvino.opset1.ops import constant +from openvino.opset1.ops import convert +from openvino.opset1.ops import convert_like +from openvino.opset1.ops import convolution +from openvino.opset1.ops import convolution_backprop_data +from openvino.opset1.ops import cos +from openvino.opset1.ops import cosh +from openvino.opset1.ops import ctc_greedy_decoder +from openvino.opset3.ops import cum_sum +from openvino.opset3.ops import cum_sum as cumsum +from openvino.opset1.ops import deformable_convolution +from openvino.opset1.ops import deformable_psroi_pooling +from openvino.opset1.ops import depth_to_space +from openvino.opset1.ops import detection_output +from openvino.opset1.ops import divide +from openvino.opset1.ops import elu +from openvino.opset3.ops import embedding_bag_offsets_sum +from openvino.opset3.ops import embedding_bag_packed_sum +from openvino.opset3.ops import embedding_segments_sum +from openvino.opset3.ops import extract_image_patches +from openvino.opset1.ops import equal +from openvino.opset1.ops import erf +from openvino.opset1.ops import exp +from openvino.opset1.ops import fake_quantize +from openvino.opset1.ops import floor +from openvino.opset1.ops import floor_mod +from openvino.opset1.ops import gather +from openvino.opset1.ops import gather_tree +from openvino.opset2.ops import gelu +from openvino.opset1.ops import greater +from openvino.opset1.ops import greater_equal +from openvino.opset1.ops import grn +from openvino.opset1.ops import group_convolution +from openvino.opset1.ops import group_convolution_backprop_data +from openvino.opset3.ops import gru_cell +from openvino.opset1.ops import hard_sigmoid +from openvino.opset1.ops import interpolate +from openvino.opset1.ops import less 
+from openvino.opset1.ops import less_equal +from openvino.opset1.ops import log +from openvino.opset1.ops import logical_and +from openvino.opset1.ops import logical_not +from openvino.opset1.ops import logical_or +from openvino.opset1.ops import logical_xor +from openvino.opset1.ops import lrn +from openvino.opset1.ops import lstm_cell +from openvino.opset1.ops import matmul +from openvino.opset1.ops import max_pool +from openvino.opset1.ops import maximum +from openvino.opset1.ops import minimum +from openvino.opset1.ops import mod +from openvino.opset1.ops import multiply +from openvino.opset2.ops import mvn +from openvino.opset1.ops import negative +from openvino.opset3.ops import non_max_suppression +from openvino.opset3.ops import non_zero +from openvino.opset1.ops import normalize_l2 +from openvino.opset1.ops import not_equal +from openvino.opset1.ops import one_hot +from openvino.opset1.ops import pad +from openvino.opset1.ops import parameter +from openvino.opset1.ops import power +from openvino.opset1.ops import prelu +from openvino.opset1.ops import prior_box +from openvino.opset1.ops import prior_box_clustered +from openvino.opset1.ops import psroi_pooling +from openvino.opset1.ops import proposal +from openvino.opset1.ops import range +from openvino.opset3.ops import read_value +from openvino.opset1.ops import reduce_logical_and +from openvino.opset1.ops import reduce_logical_or +from openvino.opset1.ops import reduce_max +from openvino.opset1.ops import reduce_mean +from openvino.opset1.ops import reduce_min +from openvino.opset1.ops import reduce_prod +from openvino.opset1.ops import reduce_sum +from openvino.opset1.ops import region_yolo +from openvino.opset2.ops import reorg_yolo +from openvino.opset1.ops import relu +from openvino.opset1.ops import reshape +from openvino.opset1.ops import result +from openvino.opset1.ops import reverse_sequence +from openvino.opset3.ops import rnn_cell +from openvino.opset3.ops import roi_align +from 
openvino.opset2.ops import roi_pooling +from openvino.opset3.ops import scatter_elements_update +from openvino.opset3.ops import scatter_update +from openvino.opset1.ops import select +from openvino.opset1.ops import selu +from openvino.opset3.ops import shape_of +from openvino.opset3.ops import shuffle_channels +from openvino.opset1.ops import sigmoid +from openvino.opset1.ops import sign +from openvino.opset1.ops import sin +from openvino.opset1.ops import sinh +from openvino.opset1.ops import softmax +from openvino.opset2.ops import space_to_batch +from openvino.opset1.ops import space_to_depth +from openvino.opset1.ops import split +from openvino.opset1.ops import sqrt +from openvino.opset1.ops import squared_difference +from openvino.opset1.ops import squeeze +from openvino.opset1.ops import strided_slice +from openvino.opset1.ops import subtract +from openvino.opset1.ops import tan +from openvino.opset1.ops import tanh +from openvino.opset1.ops import tensor_iterator +from openvino.opset1.ops import tile +from openvino.opset3.ops import topk +from openvino.opset1.ops import transpose +from openvino.opset1.ops import unsqueeze +from openvino.opset1.ops import variadic_split diff --git a/src/bindings/python/src/openvino/runtime/opset3/ops.py b/src/bindings/python/src/openvino/opset3/ops.py similarity index 99% rename from src/bindings/python/src/openvino/runtime/opset3/ops.py rename to src/bindings/python/src/openvino/opset3/ops.py index d3403dbbbd2be8..989f5819acb685 100644 --- a/src/bindings/python/src/openvino/runtime/opset3/ops.py +++ b/src/bindings/python/src/openvino/opset3/ops.py @@ -9,7 +9,7 @@ from functools import partial from openvino.runtime import Node, Shape -from openvino.runtime.op import Constant, Parameter +from openvino.op import Constant, Parameter from openvino.runtime.opset_utils import _get_node_factory from openvino.runtime.utils.decorators import binary_op, nameable_op, unary_op from openvino.runtime.utils.input_validation import ( diff 
--git a/src/bindings/python/src/openvino/opset4/__init__.py b/src/bindings/python/src/openvino/opset4/__init__.py new file mode 100644 index 00000000000000..6096ec431c796e --- /dev/null +++ b/src/bindings/python/src/openvino/opset4/__init__.py @@ -0,0 +1,144 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.opset1.ops import absolute +from openvino.opset1.ops import absolute as abs +from openvino.opset1.ops import acos +from openvino.opset4.ops import acosh +from openvino.opset1.ops import add +from openvino.opset1.ops import asin +from openvino.opset4.ops import asinh +from openvino.opset3.ops import assign +from openvino.opset1.ops import atan +from openvino.opset4.ops import atanh +from openvino.opset1.ops import avg_pool +from openvino.opset1.ops import batch_norm_inference +from openvino.opset2.ops import batch_to_space +from openvino.opset1.ops import binary_convolution +from openvino.opset3.ops import broadcast +from openvino.opset3.ops import bucketize +from openvino.opset1.ops import ceiling +from openvino.opset1.ops import ceiling as ceil +from openvino.opset1.ops import clamp +from openvino.opset1.ops import concat +from openvino.opset1.ops import constant +from openvino.opset1.ops import convert +from openvino.opset1.ops import convert_like +from openvino.opset1.ops import convolution +from openvino.opset1.ops import convolution_backprop_data +from openvino.opset1.ops import cos +from openvino.opset1.ops import cosh +from openvino.opset1.ops import ctc_greedy_decoder +from openvino.opset4.ops import ctc_loss +from openvino.opset3.ops import cum_sum +from openvino.opset3.ops import cum_sum as cumsum +from openvino.opset1.ops import deformable_convolution +from openvino.opset1.ops import deformable_psroi_pooling +from openvino.opset1.ops import depth_to_space +from openvino.opset1.ops import detection_output +from openvino.opset1.ops import divide +from openvino.opset1.ops 
import elu +from openvino.opset3.ops import embedding_bag_offsets_sum +from openvino.opset3.ops import embedding_bag_packed_sum +from openvino.opset3.ops import embedding_segments_sum +from openvino.opset3.ops import extract_image_patches +from openvino.opset1.ops import equal +from openvino.opset1.ops import erf +from openvino.opset1.ops import exp +from openvino.opset1.ops import fake_quantize +from openvino.opset1.ops import floor +from openvino.opset1.ops import floor_mod +from openvino.opset1.ops import gather +from openvino.opset1.ops import gather_tree +from openvino.opset2.ops import gelu +from openvino.opset1.ops import greater +from openvino.opset1.ops import greater_equal +from openvino.opset1.ops import grn +from openvino.opset1.ops import group_convolution +from openvino.opset1.ops import group_convolution_backprop_data +from openvino.opset3.ops import gru_cell +from openvino.opset1.ops import hard_sigmoid +from openvino.opset4.ops import hswish +from openvino.opset1.ops import interpolate +from openvino.opset1.ops import less +from openvino.opset1.ops import less_equal +from openvino.opset1.ops import log +from openvino.opset1.ops import logical_and +from openvino.opset1.ops import logical_not +from openvino.opset1.ops import logical_or +from openvino.opset1.ops import logical_xor +from openvino.opset1.ops import lrn +from openvino.opset4.ops import lstm_cell +from openvino.opset1.ops import matmul +from openvino.opset1.ops import max_pool +from openvino.opset1.ops import maximum +from openvino.opset1.ops import minimum +from openvino.opset4.ops import mish +from openvino.opset1.ops import mod +from openvino.opset1.ops import multiply +from openvino.opset2.ops import mvn +from openvino.opset1.ops import negative +from openvino.opset4.ops import non_max_suppression +from openvino.opset3.ops import non_zero +from openvino.opset1.ops import normalize_l2 +from openvino.opset1.ops import not_equal +from openvino.opset1.ops import one_hot +from 
openvino.opset1.ops import pad +from openvino.opset1.ops import parameter +from openvino.opset1.ops import power +from openvino.opset1.ops import prelu +from openvino.opset1.ops import prior_box +from openvino.opset1.ops import prior_box_clustered +from openvino.opset1.ops import psroi_pooling +from openvino.opset4.ops import proposal +from openvino.opset1.ops import range +from openvino.opset3.ops import read_value +from openvino.opset4.ops import reduce_l1 +from openvino.opset4.ops import reduce_l2 +from openvino.opset1.ops import reduce_logical_and +from openvino.opset1.ops import reduce_logical_or +from openvino.opset1.ops import reduce_max +from openvino.opset1.ops import reduce_mean +from openvino.opset1.ops import reduce_min +from openvino.opset1.ops import reduce_prod +from openvino.opset1.ops import reduce_sum +from openvino.opset1.ops import region_yolo +from openvino.opset2.ops import reorg_yolo +from openvino.opset1.ops import relu +from openvino.opset1.ops import reshape +from openvino.opset1.ops import result +from openvino.opset1.ops import reverse_sequence +from openvino.opset3.ops import rnn_cell +from openvino.opset3.ops import roi_align +from openvino.opset2.ops import roi_pooling +from openvino.opset3.ops import scatter_elements_update +from openvino.opset4.ops import scatter_nd_update +from openvino.opset3.ops import scatter_update +from openvino.opset1.ops import select +from openvino.opset1.ops import selu +from openvino.opset3.ops import shape_of +from openvino.opset3.ops import shuffle_channels +from openvino.opset1.ops import sigmoid +from openvino.opset1.ops import sign +from openvino.opset1.ops import sin +from openvino.opset1.ops import sinh +from openvino.opset1.ops import softmax +from openvino.opset4.ops import softplus +from openvino.opset2.ops import space_to_batch +from openvino.opset1.ops import space_to_depth +from openvino.opset1.ops import split +from openvino.opset1.ops import sqrt +from openvino.opset1.ops import 
squared_difference +from openvino.opset1.ops import squeeze +from openvino.opset1.ops import strided_slice +from openvino.opset1.ops import subtract +from openvino.opset4.ops import swish +from openvino.opset1.ops import tan +from openvino.opset1.ops import tanh +from openvino.opset1.ops import tensor_iterator +from openvino.opset1.ops import tile +from openvino.opset3.ops import topk +from openvino.opset1.ops import transpose +from openvino.opset1.ops import unsqueeze +from openvino.opset1.ops import variadic_split diff --git a/src/bindings/python/src/openvino/runtime/opset4/ops.py b/src/bindings/python/src/openvino/opset4/ops.py similarity index 99% rename from src/bindings/python/src/openvino/runtime/opset4/ops.py rename to src/bindings/python/src/openvino/opset4/ops.py index dce4879bdb38f6..4f6ba016852b02 100644 --- a/src/bindings/python/src/openvino/runtime/opset4/ops.py +++ b/src/bindings/python/src/openvino/opset4/ops.py @@ -9,7 +9,7 @@ from functools import partial from openvino.runtime import Node, Shape -from openvino.runtime.op import Constant, Parameter +from openvino.op import Constant, Parameter from openvino.runtime.opset_utils import _get_node_factory from openvino.runtime.utils.decorators import binary_op, nameable_op, unary_op from openvino.runtime.utils.input_validation import ( diff --git a/src/bindings/python/src/openvino/opset5/__init__.py b/src/bindings/python/src/openvino/opset5/__init__.py new file mode 100644 index 00000000000000..202b8137093f57 --- /dev/null +++ b/src/bindings/python/src/openvino/opset5/__init__.py @@ -0,0 +1,152 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.opset1.ops import absolute +from openvino.opset1.ops import absolute as abs +from openvino.opset1.ops import acos +from openvino.opset4.ops import acosh +from openvino.opset1.ops import add +from openvino.opset1.ops import asin +from openvino.opset4.ops import asinh +from 
openvino.opset3.ops import assign +from openvino.opset1.ops import atan +from openvino.opset4.ops import atanh +from openvino.opset1.ops import avg_pool +from openvino.opset5.ops import batch_norm_inference +from openvino.opset2.ops import batch_to_space +from openvino.opset1.ops import binary_convolution +from openvino.opset3.ops import broadcast +from openvino.opset3.ops import bucketize +from openvino.opset1.ops import ceiling +from openvino.opset1.ops import ceiling as ceil +from openvino.opset1.ops import clamp +from openvino.opset1.ops import concat +from openvino.opset1.ops import constant +from openvino.opset1.ops import convert +from openvino.opset1.ops import convert_like +from openvino.opset1.ops import convolution +from openvino.opset1.ops import convolution_backprop_data +from openvino.opset1.ops import cos +from openvino.opset1.ops import cosh +from openvino.opset1.ops import ctc_greedy_decoder +from openvino.opset4.ops import ctc_loss +from openvino.opset3.ops import cum_sum +from openvino.opset3.ops import cum_sum as cumsum +from openvino.opset1.ops import deformable_convolution +from openvino.opset1.ops import deformable_psroi_pooling +from openvino.opset1.ops import depth_to_space +from openvino.opset1.ops import detection_output +from openvino.opset1.ops import divide +from openvino.opset1.ops import elu +from openvino.opset3.ops import embedding_bag_offsets_sum +from openvino.opset3.ops import embedding_bag_packed_sum +from openvino.opset3.ops import embedding_segments_sum +from openvino.opset3.ops import extract_image_patches +from openvino.opset1.ops import equal +from openvino.opset1.ops import erf +from openvino.opset1.ops import exp +from openvino.opset1.ops import fake_quantize +from openvino.opset1.ops import floor +from openvino.opset1.ops import floor_mod +from openvino.opset1.ops import gather +from openvino.opset5.ops import gather_nd +from openvino.opset1.ops import gather_tree +from openvino.opset2.ops import gelu +from 
openvino.opset1.ops import greater +from openvino.opset1.ops import greater_equal +from openvino.opset1.ops import grn +from openvino.opset1.ops import group_convolution +from openvino.opset1.ops import group_convolution_backprop_data +from openvino.opset3.ops import gru_cell +from openvino.opset5.ops import gru_sequence +from openvino.opset1.ops import hard_sigmoid +from openvino.opset5.ops import hsigmoid +from openvino.opset4.ops import hswish +from openvino.opset1.ops import interpolate +from openvino.opset1.ops import less +from openvino.opset1.ops import less_equal +from openvino.opset1.ops import log +from openvino.opset1.ops import logical_and +from openvino.opset1.ops import logical_not +from openvino.opset1.ops import logical_or +from openvino.opset1.ops import logical_xor +from openvino.opset5.ops import log_softmax +from openvino.opset5.ops import loop +from openvino.opset1.ops import lrn +from openvino.opset4.ops import lstm_cell +from openvino.opset5.ops import lstm_sequence +from openvino.opset1.ops import matmul +from openvino.opset1.ops import max_pool +from openvino.opset1.ops import maximum +from openvino.opset1.ops import minimum +from openvino.opset4.ops import mish +from openvino.opset1.ops import mod +from openvino.opset1.ops import multiply +from openvino.opset2.ops import mvn +from openvino.opset1.ops import negative +from openvino.opset5.ops import non_max_suppression +from openvino.opset3.ops import non_zero +from openvino.opset1.ops import normalize_l2 +from openvino.opset1.ops import not_equal +from openvino.opset1.ops import one_hot +from openvino.opset1.ops import pad +from openvino.opset1.ops import parameter +from openvino.opset1.ops import power +from openvino.opset1.ops import prelu +from openvino.opset1.ops import prior_box +from openvino.opset1.ops import prior_box_clustered +from openvino.opset1.ops import psroi_pooling +from openvino.opset4.ops import proposal +from openvino.opset1.ops import range +from openvino.opset3.ops 
import read_value +from openvino.opset4.ops import reduce_l1 +from openvino.opset4.ops import reduce_l2 +from openvino.opset1.ops import reduce_logical_and +from openvino.opset1.ops import reduce_logical_or +from openvino.opset1.ops import reduce_max +from openvino.opset1.ops import reduce_mean +from openvino.opset1.ops import reduce_min +from openvino.opset1.ops import reduce_prod +from openvino.opset1.ops import reduce_sum +from openvino.opset1.ops import region_yolo +from openvino.opset2.ops import reorg_yolo +from openvino.opset1.ops import relu +from openvino.opset1.ops import reshape +from openvino.opset1.ops import result +from openvino.opset1.ops import reverse_sequence +from openvino.opset3.ops import rnn_cell +from openvino.opset5.ops import rnn_sequence +from openvino.opset3.ops import roi_align +from openvino.opset2.ops import roi_pooling +from openvino.opset5.ops import round +from openvino.opset3.ops import scatter_elements_update +from openvino.opset4.ops import scatter_nd_update +from openvino.opset3.ops import scatter_update +from openvino.opset1.ops import select +from openvino.opset1.ops import selu +from openvino.opset3.ops import shape_of +from openvino.opset3.ops import shuffle_channels +from openvino.opset1.ops import sigmoid +from openvino.opset1.ops import sign +from openvino.opset1.ops import sin +from openvino.opset1.ops import sinh +from openvino.opset1.ops import softmax +from openvino.opset4.ops import softplus +from openvino.opset2.ops import space_to_batch +from openvino.opset1.ops import space_to_depth +from openvino.opset1.ops import split +from openvino.opset1.ops import sqrt +from openvino.opset1.ops import squared_difference +from openvino.opset1.ops import squeeze +from openvino.opset1.ops import strided_slice +from openvino.opset1.ops import subtract +from openvino.opset4.ops import swish +from openvino.opset1.ops import tan +from openvino.opset1.ops import tanh +from openvino.opset1.ops import tensor_iterator +from 
openvino.opset1.ops import tile +from openvino.opset3.ops import topk +from openvino.opset1.ops import transpose +from openvino.opset1.ops import unsqueeze +from openvino.opset1.ops import variadic_split diff --git a/src/bindings/python/src/openvino/runtime/opset5/ops.py b/src/bindings/python/src/openvino/opset5/ops.py similarity index 99% rename from src/bindings/python/src/openvino/runtime/opset5/ops.py rename to src/bindings/python/src/openvino/opset5/ops.py index 11e05aa1b7bcb4..20057b78c7c31d 100644 --- a/src/bindings/python/src/openvino/runtime/opset5/ops.py +++ b/src/bindings/python/src/openvino/opset5/ops.py @@ -9,7 +9,7 @@ from functools import partial from openvino.runtime import Node, Shape -from openvino.runtime.op import Constant, Parameter, loop +from openvino.op import Constant, Parameter, loop from openvino.runtime.opset_utils import _get_node_factory from openvino.runtime.utils.decorators import binary_op, nameable_op, unary_op from openvino.runtime.utils.input_validation import ( diff --git a/src/bindings/python/src/openvino/opset6/__init__.py b/src/bindings/python/src/openvino/opset6/__init__.py new file mode 100644 index 00000000000000..315d80bc025a38 --- /dev/null +++ b/src/bindings/python/src/openvino/opset6/__init__.py @@ -0,0 +1,154 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.opset1.ops import absolute +from openvino.opset1.ops import absolute as abs +from openvino.opset1.ops import acos +from openvino.opset4.ops import acosh +from openvino.opset1.ops import add +from openvino.opset1.ops import asin +from openvino.opset4.ops import asinh +from openvino.opset6.ops import assign +from openvino.opset1.ops import atan +from openvino.opset4.ops import atanh +from openvino.opset1.ops import avg_pool +from openvino.opset5.ops import batch_norm_inference +from openvino.opset2.ops import batch_to_space +from openvino.opset1.ops import binary_convolution +from 
openvino.opset3.ops import broadcast +from openvino.opset3.ops import bucketize +from openvino.opset1.ops import ceiling +from openvino.opset1.ops import ceiling as ceil +from openvino.opset1.ops import clamp +from openvino.opset1.ops import concat +from openvino.opset1.ops import constant +from openvino.opset1.ops import convert +from openvino.opset1.ops import convert_like +from openvino.opset1.ops import convolution +from openvino.opset1.ops import convolution_backprop_data +from openvino.opset1.ops import cos +from openvino.opset1.ops import cosh +from openvino.opset1.ops import ctc_greedy_decoder +from openvino.opset6.ops import ctc_greedy_decoder_seq_len +from openvino.opset4.ops import ctc_loss +from openvino.opset3.ops import cum_sum +from openvino.opset3.ops import cum_sum as cumsum +from openvino.opset1.ops import deformable_convolution +from openvino.opset1.ops import deformable_psroi_pooling +from openvino.opset1.ops import depth_to_space +from openvino.opset1.ops import detection_output +from openvino.opset1.ops import divide +from openvino.opset1.ops import elu +from openvino.opset3.ops import embedding_bag_offsets_sum +from openvino.opset3.ops import embedding_bag_packed_sum +from openvino.opset3.ops import embedding_segments_sum +from openvino.opset3.ops import extract_image_patches +from openvino.opset1.ops import equal +from openvino.opset1.ops import erf +from openvino.opset1.ops import exp +from openvino.opset1.ops import fake_quantize +from openvino.opset1.ops import floor +from openvino.opset1.ops import floor_mod +from openvino.opset1.ops import gather +from openvino.opset6.ops import gather_elements +from openvino.opset5.ops import gather_nd +from openvino.opset1.ops import gather_tree +from openvino.opset2.ops import gelu +from openvino.opset1.ops import greater +from openvino.opset1.ops import greater_equal +from openvino.opset1.ops import grn +from openvino.opset1.ops import group_convolution +from openvino.opset1.ops import 
group_convolution_backprop_data +from openvino.opset3.ops import gru_cell +from openvino.opset5.ops import gru_sequence +from openvino.opset1.ops import hard_sigmoid +from openvino.opset5.ops import hsigmoid +from openvino.opset4.ops import hswish +from openvino.opset1.ops import interpolate +from openvino.opset1.ops import less +from openvino.opset1.ops import less_equal +from openvino.opset1.ops import log +from openvino.opset1.ops import logical_and +from openvino.opset1.ops import logical_not +from openvino.opset1.ops import logical_or +from openvino.opset1.ops import logical_xor +from openvino.opset5.ops import log_softmax +from openvino.opset5.ops import loop +from openvino.opset1.ops import lrn +from openvino.opset4.ops import lstm_cell +from openvino.opset5.ops import lstm_sequence +from openvino.opset1.ops import matmul +from openvino.opset1.ops import max_pool +from openvino.opset1.ops import maximum +from openvino.opset1.ops import minimum +from openvino.opset4.ops import mish +from openvino.opset1.ops import mod +from openvino.opset1.ops import multiply +from openvino.opset6.ops import mvn +from openvino.opset1.ops import negative +from openvino.opset5.ops import non_max_suppression +from openvino.opset3.ops import non_zero +from openvino.opset1.ops import normalize_l2 +from openvino.opset1.ops import not_equal +from openvino.opset1.ops import one_hot +from openvino.opset1.ops import pad +from openvino.opset1.ops import parameter +from openvino.opset1.ops import power +from openvino.opset1.ops import prelu +from openvino.opset1.ops import prior_box +from openvino.opset1.ops import prior_box_clustered +from openvino.opset1.ops import psroi_pooling +from openvino.opset4.ops import proposal +from openvino.opset1.ops import range +from openvino.opset6.ops import read_value +from openvino.opset4.ops import reduce_l1 +from openvino.opset4.ops import reduce_l2 +from openvino.opset1.ops import reduce_logical_and +from openvino.opset1.ops import 
reduce_logical_or +from openvino.opset1.ops import reduce_max +from openvino.opset1.ops import reduce_mean +from openvino.opset1.ops import reduce_min +from openvino.opset1.ops import reduce_prod +from openvino.opset1.ops import reduce_sum +from openvino.opset1.ops import region_yolo +from openvino.opset2.ops import reorg_yolo +from openvino.opset1.ops import relu +from openvino.opset1.ops import reshape +from openvino.opset1.ops import result +from openvino.opset1.ops import reverse_sequence +from openvino.opset3.ops import rnn_cell +from openvino.opset5.ops import rnn_sequence +from openvino.opset3.ops import roi_align +from openvino.opset2.ops import roi_pooling +from openvino.opset5.ops import round +from openvino.opset3.ops import scatter_elements_update +from openvino.opset4.ops import scatter_nd_update +from openvino.opset3.ops import scatter_update +from openvino.opset1.ops import select +from openvino.opset1.ops import selu +from openvino.opset3.ops import shape_of +from openvino.opset3.ops import shuffle_channels +from openvino.opset1.ops import sigmoid +from openvino.opset1.ops import sign +from openvino.opset1.ops import sin +from openvino.opset1.ops import sinh +from openvino.opset1.ops import softmax +from openvino.opset4.ops import softplus +from openvino.opset2.ops import space_to_batch +from openvino.opset1.ops import space_to_depth +from openvino.opset1.ops import split +from openvino.opset1.ops import sqrt +from openvino.opset1.ops import squared_difference +from openvino.opset1.ops import squeeze +from openvino.opset1.ops import strided_slice +from openvino.opset1.ops import subtract +from openvino.opset4.ops import swish +from openvino.opset1.ops import tan +from openvino.opset1.ops import tanh +from openvino.opset1.ops import tensor_iterator +from openvino.opset1.ops import tile +from openvino.opset3.ops import topk +from openvino.opset1.ops import transpose +from openvino.opset1.ops import unsqueeze +from openvino.opset1.ops import 
variadic_split diff --git a/src/bindings/python/src/openvino/runtime/opset6/ops.py b/src/bindings/python/src/openvino/opset6/ops.py similarity index 97% rename from src/bindings/python/src/openvino/runtime/opset6/ops.py rename to src/bindings/python/src/openvino/opset6/ops.py index 3209d7a8a53c99..8020715f20dea3 100644 --- a/src/bindings/python/src/openvino/runtime/opset6/ops.py +++ b/src/bindings/python/src/openvino/opset6/ops.py @@ -10,9 +10,9 @@ from functools import partial, singledispatch from openvino.runtime import Node, Type, PartialShape, Output, Shape -from openvino.runtime.op import assign, Constant, Parameter -from openvino.runtime.op import read_value as _read_value -from openvino.runtime.op.util import VariableInfo, Variable +from openvino.op import assign, Constant, Parameter +from openvino.op import read_value as _read_value +from openvino.op.util import VariableInfo, Variable from openvino.runtime.opset_utils import _get_node_factory from openvino.runtime.utils.decorators import nameable_op, overloading from openvino.runtime.utils.types import ( diff --git a/src/bindings/python/src/openvino/opset7/__init__.py b/src/bindings/python/src/openvino/opset7/__init__.py new file mode 100644 index 00000000000000..16703072285487 --- /dev/null +++ b/src/bindings/python/src/openvino/opset7/__init__.py @@ -0,0 +1,158 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.opset1.ops import absolute +from openvino.opset1.ops import absolute as abs +from openvino.opset1.ops import acos +from openvino.opset4.ops import acosh +from openvino.opset1.ops import add +from openvino.opset1.ops import asin +from openvino.opset4.ops import asinh +from openvino.opset6.ops import assign +from openvino.opset1.ops import atan +from openvino.opset4.ops import atanh +from openvino.opset1.ops import avg_pool +from openvino.opset5.ops import batch_norm_inference +from openvino.opset2.ops import batch_to_space 
+from openvino.opset1.ops import binary_convolution +from openvino.opset3.ops import broadcast +from openvino.opset3.ops import bucketize +from openvino.opset1.ops import ceiling +from openvino.opset1.ops import ceiling as ceil +from openvino.opset1.ops import clamp +from openvino.opset1.ops import concat +from openvino.opset1.ops import constant +from openvino.opset1.ops import convert +from openvino.opset1.ops import convert_like +from openvino.opset1.ops import convolution +from openvino.opset1.ops import convolution_backprop_data +from openvino.opset1.ops import cos +from openvino.opset1.ops import cosh +from openvino.opset1.ops import ctc_greedy_decoder +from openvino.opset6.ops import ctc_greedy_decoder_seq_len +from openvino.opset4.ops import ctc_loss +from openvino.opset3.ops import cum_sum +from openvino.opset3.ops import cum_sum as cumsum +from openvino.opset1.ops import deformable_convolution +from openvino.opset1.ops import deformable_psroi_pooling +from openvino.opset1.ops import depth_to_space +from openvino.opset1.ops import detection_output +from openvino.opset7.ops import dft +from openvino.opset1.ops import divide +from openvino.opset7.ops import einsum +from openvino.opset1.ops import elu +from openvino.opset3.ops import embedding_bag_offsets_sum +from openvino.opset3.ops import embedding_bag_packed_sum +from openvino.opset3.ops import embedding_segments_sum +from openvino.opset3.ops import extract_image_patches +from openvino.opset1.ops import equal +from openvino.opset1.ops import erf +from openvino.opset1.ops import exp +from openvino.opset1.ops import fake_quantize +from openvino.opset1.ops import floor +from openvino.opset1.ops import floor_mod +from openvino.opset7.ops import gather +from openvino.opset6.ops import gather_elements +from openvino.opset5.ops import gather_nd +from openvino.opset1.ops import gather_tree +from openvino.opset7.ops import gelu +from openvino.opset1.ops import greater +from openvino.opset1.ops import greater_equal 
+from openvino.opset1.ops import grn +from openvino.opset1.ops import group_convolution +from openvino.opset1.ops import group_convolution_backprop_data +from openvino.opset3.ops import gru_cell +from openvino.opset5.ops import gru_sequence +from openvino.opset1.ops import hard_sigmoid +from openvino.opset5.ops import hsigmoid +from openvino.opset4.ops import hswish +from openvino.opset7.ops import idft +from openvino.opset1.ops import interpolate +from openvino.opset1.ops import less +from openvino.opset1.ops import less_equal +from openvino.opset1.ops import log +from openvino.opset1.ops import logical_and +from openvino.opset1.ops import logical_not +from openvino.opset1.ops import logical_or +from openvino.opset1.ops import logical_xor +from openvino.opset5.ops import log_softmax +from openvino.opset5.ops import loop +from openvino.opset1.ops import lrn +from openvino.opset4.ops import lstm_cell +from openvino.opset5.ops import lstm_sequence +from openvino.opset1.ops import matmul +from openvino.opset1.ops import max_pool +from openvino.opset1.ops import maximum +from openvino.opset1.ops import minimum +from openvino.opset4.ops import mish +from openvino.opset1.ops import mod +from openvino.opset1.ops import multiply +from openvino.opset6.ops import mvn +from openvino.opset1.ops import negative +from openvino.opset5.ops import non_max_suppression +from openvino.opset3.ops import non_zero +from openvino.opset1.ops import normalize_l2 +from openvino.opset1.ops import not_equal +from openvino.opset1.ops import one_hot +from openvino.opset1.ops import pad +from openvino.opset1.ops import parameter +from openvino.opset1.ops import power +from openvino.opset1.ops import prelu +from openvino.opset1.ops import prior_box +from openvino.opset1.ops import prior_box_clustered +from openvino.opset1.ops import psroi_pooling +from openvino.opset4.ops import proposal +from openvino.opset1.ops import range +from openvino.opset6.ops import read_value +from openvino.opset4.ops 
import reduce_l1 +from openvino.opset4.ops import reduce_l2 +from openvino.opset1.ops import reduce_logical_and +from openvino.opset1.ops import reduce_logical_or +from openvino.opset1.ops import reduce_max +from openvino.opset1.ops import reduce_mean +from openvino.opset1.ops import reduce_min +from openvino.opset1.ops import reduce_prod +from openvino.opset1.ops import reduce_sum +from openvino.opset1.ops import region_yolo +from openvino.opset2.ops import reorg_yolo +from openvino.opset1.ops import relu +from openvino.opset1.ops import reshape +from openvino.opset1.ops import result +from openvino.opset1.ops import reverse_sequence +from openvino.opset3.ops import rnn_cell +from openvino.opset5.ops import rnn_sequence +from openvino.opset3.ops import roi_align +from openvino.opset2.ops import roi_pooling +from openvino.opset7.ops import roll +from openvino.opset5.ops import round +from openvino.opset3.ops import scatter_elements_update +from openvino.opset4.ops import scatter_nd_update +from openvino.opset3.ops import scatter_update +from openvino.opset1.ops import select +from openvino.opset1.ops import selu +from openvino.opset3.ops import shape_of +from openvino.opset3.ops import shuffle_channels +from openvino.opset1.ops import sigmoid +from openvino.opset1.ops import sign +from openvino.opset1.ops import sin +from openvino.opset1.ops import sinh +from openvino.opset1.ops import softmax +from openvino.opset4.ops import softplus +from openvino.opset2.ops import space_to_batch +from openvino.opset1.ops import space_to_depth +from openvino.opset1.ops import split +from openvino.opset1.ops import sqrt +from openvino.opset1.ops import squared_difference +from openvino.opset1.ops import squeeze +from openvino.opset1.ops import strided_slice +from openvino.opset1.ops import subtract +from openvino.opset4.ops import swish +from openvino.opset1.ops import tan +from openvino.opset1.ops import tanh +from openvino.opset1.ops import tensor_iterator +from 
openvino.opset1.ops import tile +from openvino.opset3.ops import topk +from openvino.opset1.ops import transpose +from openvino.opset1.ops import unsqueeze +from openvino.opset1.ops import variadic_split diff --git a/src/bindings/python/src/openvino/runtime/opset7/ops.py b/src/bindings/python/src/openvino/opset7/ops.py similarity index 98% rename from src/bindings/python/src/openvino/runtime/opset7/ops.py rename to src/bindings/python/src/openvino/opset7/ops.py index 7cadaa42b80443..59e09b64888eb1 100644 --- a/src/bindings/python/src/openvino/runtime/opset7/ops.py +++ b/src/bindings/python/src/openvino/opset7/ops.py @@ -8,7 +8,7 @@ import numpy as np from openvino.runtime import Node, Shape -from openvino.runtime.op import Constant, Parameter +from openvino.op import Constant, Parameter from openvino.runtime.opset_utils import _get_node_factory from openvino.runtime.utils.decorators import binary_op, nameable_op, unary_op from openvino.runtime.utils.input_validation import ( diff --git a/src/bindings/python/src/openvino/opset8/__init__.py b/src/bindings/python/src/openvino/opset8/__init__.py new file mode 100644 index 00000000000000..e45c37863b193b --- /dev/null +++ b/src/bindings/python/src/openvino/opset8/__init__.py @@ -0,0 +1,169 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.opset1.ops import absolute +from openvino.opset1.ops import absolute as abs +from openvino.opset1.ops import acos +from openvino.opset4.ops import acosh +from openvino.opset8.ops import adaptive_avg_pool +from openvino.opset8.ops import adaptive_max_pool +from openvino.opset1.ops import add +from openvino.opset1.ops import asin +from openvino.opset4.ops import asinh +from openvino.opset6.ops import assign +from openvino.opset1.ops import atan +from openvino.opset4.ops import atanh +from openvino.opset1.ops import avg_pool +from openvino.opset5.ops import batch_norm_inference +from openvino.opset2.ops import 
batch_to_space +from openvino.opset1.ops import binary_convolution +from openvino.opset3.ops import broadcast +from openvino.opset3.ops import bucketize +from openvino.opset1.ops import ceiling +from openvino.opset1.ops import ceiling as ceil +from openvino.opset1.ops import clamp +from openvino.opset1.ops import concat +from openvino.opset1.ops import constant +from openvino.opset1.ops import convert +from openvino.opset1.ops import convert_like +from openvino.opset1.ops import convolution +from openvino.opset1.ops import convolution_backprop_data +from openvino.opset1.ops import cos +from openvino.opset1.ops import cosh +from openvino.opset1.ops import ctc_greedy_decoder +from openvino.opset6.ops import ctc_greedy_decoder_seq_len +from openvino.opset4.ops import ctc_loss +from openvino.opset3.ops import cum_sum +from openvino.opset3.ops import cum_sum as cumsum +from openvino.opset8.ops import deformable_convolution +from openvino.opset1.ops import deformable_psroi_pooling +from openvino.opset1.ops import depth_to_space +from openvino.opset8.ops import detection_output +from openvino.opset7.ops import dft +from openvino.opset1.ops import divide +from openvino.opset7.ops import einsum +from openvino.opset1.ops import elu +from openvino.opset3.ops import embedding_bag_offsets_sum +from openvino.opset3.ops import embedding_bag_packed_sum +from openvino.opset3.ops import embedding_segments_sum +from openvino.opset3.ops import extract_image_patches +from openvino.opset1.ops import equal +from openvino.opset1.ops import erf +from openvino.opset1.ops import exp +from openvino.opset1.ops import fake_quantize +from openvino.opset1.ops import floor +from openvino.opset1.ops import floor_mod +from openvino.opset8.ops import gather +from openvino.opset6.ops import gather_elements +from openvino.opset8.ops import gather_nd +from openvino.opset1.ops import gather_tree +from openvino.opset7.ops import gelu +from openvino.opset1.ops import greater +from openvino.opset1.ops 
import greater_equal +from openvino.opset1.ops import grn +from openvino.opset1.ops import group_convolution +from openvino.opset1.ops import group_convolution_backprop_data +from openvino.opset3.ops import gru_cell +from openvino.opset5.ops import gru_sequence +from openvino.opset1.ops import hard_sigmoid +from openvino.opset5.ops import hsigmoid +from openvino.opset4.ops import hswish +from openvino.opset7.ops import idft +from openvino.opset8.ops import if_op +from openvino.opset1.ops import interpolate +from openvino.opset8.ops import i420_to_bgr +from openvino.opset8.ops import i420_to_rgb +from openvino.opset1.ops import less +from openvino.opset1.ops import less_equal +from openvino.opset1.ops import log +from openvino.opset1.ops import logical_and +from openvino.opset1.ops import logical_not +from openvino.opset1.ops import logical_or +from openvino.opset1.ops import logical_xor +from openvino.opset5.ops import log_softmax +from openvino.opset5.ops import loop +from openvino.opset1.ops import lrn +from openvino.opset4.ops import lstm_cell +from openvino.opset5.ops import lstm_sequence +from openvino.opset1.ops import matmul +from openvino.opset8.ops import matrix_nms +from openvino.opset8.ops import max_pool +from openvino.opset1.ops import maximum +from openvino.opset1.ops import minimum +from openvino.opset4.ops import mish +from openvino.opset1.ops import mod +from openvino.opset8.ops import multiclass_nms +from openvino.opset1.ops import multiply +from openvino.opset6.ops import mvn +from openvino.opset1.ops import negative +from openvino.opset5.ops import non_max_suppression +from openvino.opset3.ops import non_zero +from openvino.opset1.ops import normalize_l2 +from openvino.opset1.ops import not_equal +from openvino.opset8.ops import nv12_to_bgr +from openvino.opset8.ops import nv12_to_rgb +from openvino.opset1.ops import one_hot +from openvino.opset1.ops import pad +from openvino.opset1.ops import parameter +from openvino.opset1.ops import power 
+from openvino.opset1.ops import prelu +from openvino.opset8.ops import prior_box +from openvino.opset1.ops import prior_box_clustered +from openvino.opset1.ops import psroi_pooling +from openvino.opset4.ops import proposal +from openvino.opset1.ops import range +from openvino.opset8.ops import random_uniform +from openvino.opset6.ops import read_value +from openvino.opset4.ops import reduce_l1 +from openvino.opset4.ops import reduce_l2 +from openvino.opset1.ops import reduce_logical_and +from openvino.opset1.ops import reduce_logical_or +from openvino.opset1.ops import reduce_max +from openvino.opset1.ops import reduce_mean +from openvino.opset1.ops import reduce_min +from openvino.opset1.ops import reduce_prod +from openvino.opset1.ops import reduce_sum +from openvino.opset1.ops import region_yolo +from openvino.opset2.ops import reorg_yolo +from openvino.opset1.ops import relu +from openvino.opset1.ops import reshape +from openvino.opset1.ops import result +from openvino.opset1.ops import reverse_sequence +from openvino.opset3.ops import rnn_cell +from openvino.opset5.ops import rnn_sequence +from openvino.opset3.ops import roi_align +from openvino.opset2.ops import roi_pooling +from openvino.opset7.ops import roll +from openvino.opset5.ops import round +from openvino.opset3.ops import scatter_elements_update +from openvino.opset4.ops import scatter_nd_update +from openvino.opset3.ops import scatter_update +from openvino.opset1.ops import select +from openvino.opset1.ops import selu +from openvino.opset3.ops import shape_of +from openvino.opset3.ops import shuffle_channels +from openvino.opset1.ops import sigmoid +from openvino.opset1.ops import sign +from openvino.opset1.ops import sin +from openvino.opset1.ops import sinh +from openvino.opset8.ops import slice +from openvino.opset8.ops import softmax +from openvino.opset4.ops import softplus +from openvino.opset2.ops import space_to_batch +from openvino.opset1.ops import space_to_depth +from 
openvino.opset1.ops import split +from openvino.opset1.ops import sqrt +from openvino.opset1.ops import squared_difference +from openvino.opset1.ops import squeeze +from openvino.opset1.ops import strided_slice +from openvino.opset1.ops import subtract +from openvino.opset4.ops import swish +from openvino.opset1.ops import tan +from openvino.opset1.ops import tanh +from openvino.opset1.ops import tensor_iterator +from openvino.opset1.ops import tile +from openvino.opset3.ops import topk +from openvino.opset1.ops import transpose +from openvino.opset1.ops import unsqueeze +from openvino.opset1.ops import variadic_split diff --git a/src/bindings/python/src/openvino/runtime/opset8/ops.py b/src/bindings/python/src/openvino/opset8/ops.py similarity index 99% rename from src/bindings/python/src/openvino/runtime/opset8/ops.py rename to src/bindings/python/src/openvino/opset8/ops.py index f9176ecd32be14..05b97390baa780 100644 --- a/src/bindings/python/src/openvino/runtime/opset8/ops.py +++ b/src/bindings/python/src/openvino/opset8/ops.py @@ -8,7 +8,7 @@ import numpy as np from openvino.runtime.exceptions import UserInputError -from openvino.runtime.op import Constant, Parameter, if_op +from openvino.op import Constant, Parameter, if_op from openvino.runtime import Node from openvino.runtime.opset_utils import _get_node_factory from openvino.runtime.utils.decorators import nameable_op diff --git a/src/bindings/python/src/openvino/opset9/__init__.py b/src/bindings/python/src/openvino/opset9/__init__.py new file mode 100644 index 00000000000000..03051d46b58759 --- /dev/null +++ b/src/bindings/python/src/openvino/opset9/__init__.py @@ -0,0 +1,175 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.opset1.ops import absolute +from openvino.opset1.ops import absolute as abs +from openvino.opset1.ops import acos +from openvino.opset4.ops import acosh +from openvino.opset8.ops import adaptive_avg_pool 
+from openvino.opset8.ops import adaptive_max_pool +from openvino.opset1.ops import add +from openvino.opset1.ops import asin +from openvino.opset4.ops import asinh +from openvino.opset6.ops import assign +from openvino.opset1.ops import atan +from openvino.opset4.ops import atanh +from openvino.opset1.ops import avg_pool +from openvino.opset5.ops import batch_norm_inference +from openvino.opset2.ops import batch_to_space +from openvino.opset1.ops import binary_convolution +from openvino.opset3.ops import broadcast +from openvino.opset3.ops import bucketize +from openvino.opset1.ops import ceiling +from openvino.opset1.ops import ceiling as ceil +from openvino.opset1.ops import clamp +from openvino.opset1.ops import concat +from openvino.opset1.ops import constant +from openvino.opset1.ops import convert +from openvino.opset1.ops import convert_like +from openvino.opset1.ops import convolution +from openvino.opset1.ops import convolution_backprop_data +from openvino.opset1.ops import cos +from openvino.opset1.ops import cosh +from openvino.opset1.ops import ctc_greedy_decoder +from openvino.opset6.ops import ctc_greedy_decoder_seq_len +from openvino.opset4.ops import ctc_loss +from openvino.opset3.ops import cum_sum +from openvino.opset3.ops import cum_sum as cumsum +from openvino.opset8.ops import deformable_convolution +from openvino.opset1.ops import deformable_psroi_pooling +from openvino.opset1.ops import depth_to_space +from openvino.opset8.ops import detection_output +from openvino.opset7.ops import dft +from openvino.opset1.ops import divide +from openvino.opset7.ops import einsum +from openvino.opset1.ops import elu +from openvino.opset3.ops import embedding_bag_offsets_sum +from openvino.opset3.ops import embedding_bag_packed_sum +from openvino.opset3.ops import embedding_segments_sum +from openvino.opset3.ops import extract_image_patches +from openvino.opset1.ops import equal +from openvino.opset1.ops import erf +from openvino.opset1.ops import exp +from 
openvino.opset9.ops import eye +from openvino.opset1.ops import fake_quantize +from openvino.opset1.ops import floor +from openvino.opset1.ops import floor_mod +from openvino.opset8.ops import gather +from openvino.opset6.ops import gather_elements +from openvino.opset8.ops import gather_nd +from openvino.opset1.ops import gather_tree +from openvino.opset7.ops import gelu +from openvino.opset9.ops import generate_proposals +from openvino.opset1.ops import greater +from openvino.opset1.ops import greater_equal +from openvino.opset9.ops import grid_sample +from openvino.opset1.ops import grn +from openvino.opset1.ops import group_convolution +from openvino.opset1.ops import group_convolution_backprop_data +from openvino.opset3.ops import gru_cell +from openvino.opset5.ops import gru_sequence +from openvino.opset1.ops import hard_sigmoid +from openvino.opset5.ops import hsigmoid +from openvino.opset4.ops import hswish +from openvino.opset7.ops import idft +from openvino.opset8.ops import if_op +from openvino.opset1.ops import interpolate +from openvino.opset9.ops import irdft +from openvino.opset8.ops import i420_to_bgr +from openvino.opset8.ops import i420_to_rgb +from openvino.opset1.ops import less +from openvino.opset1.ops import less_equal +from openvino.opset1.ops import log +from openvino.opset1.ops import logical_and +from openvino.opset1.ops import logical_not +from openvino.opset1.ops import logical_or +from openvino.opset1.ops import logical_xor +from openvino.opset5.ops import log_softmax +from openvino.opset5.ops import loop +from openvino.opset1.ops import lrn +from openvino.opset4.ops import lstm_cell +from openvino.opset5.ops import lstm_sequence +from openvino.opset1.ops import matmul +from openvino.opset8.ops import matrix_nms +from openvino.opset8.ops import max_pool +from openvino.opset1.ops import maximum +from openvino.opset1.ops import minimum +from openvino.opset4.ops import mish +from openvino.opset1.ops import mod +from openvino.opset9.ops 
import multiclass_nms +from openvino.opset1.ops import multiply +from openvino.opset6.ops import mvn +from openvino.opset1.ops import negative +from openvino.opset9.ops import non_max_suppression +from openvino.opset3.ops import non_zero +from openvino.opset1.ops import normalize_l2 +from openvino.opset1.ops import not_equal +from openvino.opset8.ops import nv12_to_bgr +from openvino.opset8.ops import nv12_to_rgb +from openvino.opset1.ops import one_hot +from openvino.opset1.ops import pad +from openvino.opset1.ops import parameter +from openvino.opset1.ops import power +from openvino.opset1.ops import prelu +from openvino.opset8.ops import prior_box +from openvino.opset1.ops import prior_box_clustered +from openvino.opset1.ops import psroi_pooling +from openvino.opset4.ops import proposal +from openvino.opset1.ops import range +from openvino.opset8.ops import random_uniform +from openvino.opset9.ops import rdft +from openvino.opset6.ops import read_value +from openvino.opset4.ops import reduce_l1 +from openvino.opset4.ops import reduce_l2 +from openvino.opset1.ops import reduce_logical_and +from openvino.opset1.ops import reduce_logical_or +from openvino.opset1.ops import reduce_max +from openvino.opset1.ops import reduce_mean +from openvino.opset1.ops import reduce_min +from openvino.opset1.ops import reduce_prod +from openvino.opset1.ops import reduce_sum +from openvino.opset1.ops import region_yolo +from openvino.opset2.ops import reorg_yolo +from openvino.opset1.ops import relu +from openvino.opset1.ops import reshape +from openvino.opset1.ops import result +from openvino.opset1.ops import reverse_sequence +from openvino.opset3.ops import rnn_cell +from openvino.opset5.ops import rnn_sequence +from openvino.opset9.ops import roi_align +from openvino.opset2.ops import roi_pooling +from openvino.opset7.ops import roll +from openvino.opset5.ops import round +from openvino.opset3.ops import scatter_elements_update +from openvino.opset4.ops import scatter_nd_update 
+from openvino.opset3.ops import scatter_update +from openvino.opset1.ops import select +from openvino.opset1.ops import selu +from openvino.opset3.ops import shape_of +from openvino.opset3.ops import shuffle_channels +from openvino.opset1.ops import sigmoid +from openvino.opset1.ops import sign +from openvino.opset1.ops import sin +from openvino.opset1.ops import sinh +from openvino.opset8.ops import slice +from openvino.opset8.ops import softmax +from openvino.opset4.ops import softplus +from openvino.opset9.ops import softsign +from openvino.opset2.ops import space_to_batch +from openvino.opset1.ops import space_to_depth +from openvino.opset1.ops import split +from openvino.opset1.ops import sqrt +from openvino.opset1.ops import squared_difference +from openvino.opset1.ops import squeeze +from openvino.opset1.ops import strided_slice +from openvino.opset1.ops import subtract +from openvino.opset4.ops import swish +from openvino.opset1.ops import tan +from openvino.opset1.ops import tanh +from openvino.opset1.ops import tensor_iterator +from openvino.opset1.ops import tile +from openvino.opset3.ops import topk +from openvino.opset1.ops import transpose +from openvino.opset1.ops import unsqueeze +from openvino.opset1.ops import variadic_split diff --git a/src/bindings/python/src/openvino/runtime/opset9/ops.py b/src/bindings/python/src/openvino/opset9/ops.py similarity index 100% rename from src/bindings/python/src/openvino/runtime/opset9/ops.py rename to src/bindings/python/src/openvino/opset9/ops.py diff --git a/src/bindings/python/src/openvino/passes/__init__.py b/src/bindings/python/src/openvino/passes/__init__.py new file mode 100644 index 00000000000000..037d9774c5b9a0 --- /dev/null +++ b/src/bindings/python/src/openvino/passes/__init__.py @@ -0,0 +1,19 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# type: ignore +# flake8: noqa + +from openvino._pyopenvino.passes import ModelPass, Matcher, MatcherPass, PassBase, 
WrapType, Or, AnyInput, Optional +from openvino._pyopenvino.passes import ( + consumers_count, + has_static_dim, + has_static_dims, + has_static_shape, + has_static_rank, + rank_equals, + type_matches, + type_matches_any, +) +from openvino._pyopenvino.passes import Serialize, ConstantFolding, VisualizeTree, MakeStateful, LowLatency2, ConvertFP32ToFP16, Version +from openvino.passes.manager import Manager +from openvino.passes.graph_rewrite import GraphRewrite, BackwardGraphRewrite diff --git a/src/bindings/python/src/openvino/runtime/passes/graph_rewrite.py b/src/bindings/python/src/openvino/passes/graph_rewrite.py similarity index 100% rename from src/bindings/python/src/openvino/runtime/passes/graph_rewrite.py rename to src/bindings/python/src/openvino/passes/graph_rewrite.py diff --git a/src/bindings/python/src/openvino/runtime/passes/manager.py b/src/bindings/python/src/openvino/passes/manager.py similarity index 100% rename from src/bindings/python/src/openvino/runtime/passes/manager.py rename to src/bindings/python/src/openvino/passes/manager.py diff --git a/src/bindings/python/src/openvino/runtime/__init__.py b/src/bindings/python/src/openvino/runtime/__init__.py index df2cab57bd0aba..e22e2be08cacea 100644 --- a/src/bindings/python/src/openvino/runtime/__init__.py +++ b/src/bindings/python/src/openvino/runtime/__init__.py @@ -58,6 +58,9 @@ from openvino.runtime import opset11 from openvino.runtime import opset12 from openvino.runtime import opset13 +from openvino.runtime import opset14 +from openvino.runtime import opset15 +from openvino.runtime import opset16 # Import properties API from openvino.runtime import properties diff --git a/src/bindings/python/src/openvino/runtime/op/__init__.py b/src/bindings/python/src/openvino/runtime/op/__init__.py index ee324dd76380f7..d7dd935d05f8d3 100644 --- a/src/bindings/python/src/openvino/runtime/op/__init__.py +++ b/src/bindings/python/src/openvino/runtime/op/__init__.py @@ -8,12 +8,12 @@ # flake8: noqa -from 
openvino._pyopenvino.op import Constant -from openvino._pyopenvino.op import assign -from openvino._pyopenvino.op import _PagedAttentionExtension -from openvino._pyopenvino.op import Parameter -from openvino._pyopenvino.op import if_op -from openvino._pyopenvino.op import loop -from openvino._pyopenvino.op import tensor_iterator -from openvino._pyopenvino.op import read_value -from openvino._pyopenvino.op import Result +from openvino.op import Constant +from openvino.op import assign +from openvino.op import _PagedAttentionExtension +from openvino.op import Parameter +from openvino.op import if_op +from openvino.op import loop +from openvino.op import tensor_iterator +from openvino.op import read_value +from openvino.op import Result diff --git a/src/bindings/python/src/openvino/runtime/op/util/__init__.py b/src/bindings/python/src/openvino/runtime/op/util/__init__.py index 0c946b115f451e..642434446c247d 100644 --- a/src/bindings/python/src/openvino/runtime/op/util/__init__.py +++ b/src/bindings/python/src/openvino/runtime/op/util/__init__.py @@ -7,16 +7,16 @@ """ # flake8: noqa -from openvino._pyopenvino.op.util import UnaryElementwiseArithmetic -from openvino._pyopenvino.op.util import BinaryElementwiseComparison -from openvino._pyopenvino.op.util import BinaryElementwiseArithmetic -from openvino._pyopenvino.op.util import BinaryElementwiseLogical -from openvino._pyopenvino.op.util import ArithmeticReduction -from openvino._pyopenvino.op.util import IndexReduction -from openvino._pyopenvino.op.util import VariableInfo -from openvino._pyopenvino.op.util import Variable -from openvino._pyopenvino.op.util import MergedInputDescription -from openvino._pyopenvino.op.util import InvariantInputDescription -from openvino._pyopenvino.op.util import SliceInputDescription -from openvino._pyopenvino.op.util import ConcatOutputDescription -from openvino._pyopenvino.op.util import BodyOutputDescription +from openvino.op.util import UnaryElementwiseArithmetic +from 
openvino.op.util import BinaryElementwiseComparison +from openvino.op.util import BinaryElementwiseArithmetic +from openvino.op.util import BinaryElementwiseLogical +from openvino.op.util import ArithmeticReduction +from openvino.op.util import IndexReduction +from openvino.op.util import VariableInfo +from openvino.op.util import Variable +from openvino.op.util import MergedInputDescription +from openvino.op.util import InvariantInputDescription +from openvino.op.util import SliceInputDescription +from openvino.op.util import ConcatOutputDescription +from openvino.op.util import BodyOutputDescription diff --git a/src/bindings/python/src/openvino/runtime/opset1/__init__.py b/src/bindings/python/src/openvino/runtime/opset1/__init__.py index ca7e1aef385b2a..35ceec6b0d9892 100644 --- a/src/bindings/python/src/openvino/runtime/opset1/__init__.py +++ b/src/bindings/python/src/openvino/runtime/opset1/__init__.py @@ -2,110 +2,110 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -from openvino.runtime.opset1.ops import absolute -from openvino.runtime.opset1.ops import absolute as abs -from openvino.runtime.opset1.ops import acos -from openvino.runtime.opset1.ops import add -from openvino.runtime.opset1.ops import asin -from openvino.runtime.opset1.ops import atan -from openvino.runtime.opset1.ops import avg_pool -from openvino.runtime.opset1.ops import batch_norm_inference -from openvino.runtime.opset1.ops import binary_convolution -from openvino.runtime.opset1.ops import broadcast -from openvino.runtime.opset1.ops import ceiling -from openvino.runtime.opset1.ops import ceiling as ceil -from openvino.runtime.opset1.ops import clamp -from openvino.runtime.opset1.ops import concat -from openvino.runtime.opset1.ops import constant -from openvino.runtime.opset1.ops import convert -from openvino.runtime.opset1.ops import convert_like -from openvino.runtime.opset1.ops import convolution -from openvino.runtime.opset1.ops import 
convolution_backprop_data -from openvino.runtime.opset1.ops import cos -from openvino.runtime.opset1.ops import cosh -from openvino.runtime.opset1.ops import ctc_greedy_decoder -from openvino.runtime.opset1.ops import deformable_convolution -from openvino.runtime.opset1.ops import deformable_psroi_pooling -from openvino.runtime.opset1.ops import depth_to_space -from openvino.runtime.opset1.ops import detection_output -from openvino.runtime.opset1.ops import divide -from openvino.runtime.opset1.ops import elu -from openvino.runtime.opset1.ops import equal -from openvino.runtime.opset1.ops import erf -from openvino.runtime.opset1.ops import exp -from openvino.runtime.opset1.ops import fake_quantize -from openvino.runtime.opset1.ops import floor -from openvino.runtime.opset1.ops import floor_mod -from openvino.runtime.opset1.ops import gather -from openvino.runtime.opset1.ops import gather_tree -from openvino.runtime.opset1.ops import greater -from openvino.runtime.opset1.ops import greater_equal -from openvino.runtime.opset1.ops import grn -from openvino.runtime.opset1.ops import group_convolution -from openvino.runtime.opset1.ops import group_convolution_backprop_data -from openvino.runtime.opset1.ops import hard_sigmoid -from openvino.runtime.opset1.ops import interpolate -from openvino.runtime.opset1.ops import less -from openvino.runtime.opset1.ops import less_equal -from openvino.runtime.opset1.ops import log -from openvino.runtime.opset1.ops import logical_and -from openvino.runtime.opset1.ops import logical_not -from openvino.runtime.opset1.ops import logical_or -from openvino.runtime.opset1.ops import logical_xor -from openvino.runtime.opset1.ops import lrn -from openvino.runtime.opset1.ops import lstm_cell -from openvino.runtime.opset1.ops import matmul -from openvino.runtime.opset1.ops import max_pool -from openvino.runtime.opset1.ops import maximum -from openvino.runtime.opset1.ops import minimum -from openvino.runtime.opset1.ops import mod -from 
openvino.runtime.opset1.ops import multiply -from openvino.runtime.opset1.ops import negative -from openvino.runtime.opset1.ops import non_max_suppression -from openvino.runtime.opset1.ops import normalize_l2 -from openvino.runtime.opset1.ops import not_equal -from openvino.runtime.opset1.ops import one_hot -from openvino.runtime.opset1.ops import pad -from openvino.runtime.opset1.ops import parameter -from openvino.runtime.opset1.ops import power -from openvino.runtime.opset1.ops import prelu -from openvino.runtime.opset1.ops import prior_box -from openvino.runtime.opset1.ops import prior_box_clustered -from openvino.runtime.opset1.ops import psroi_pooling -from openvino.runtime.opset1.ops import proposal -from openvino.runtime.opset1.ops import range -from openvino.runtime.opset1.ops import reduce_logical_and -from openvino.runtime.opset1.ops import reduce_logical_or -from openvino.runtime.opset1.ops import reduce_max -from openvino.runtime.opset1.ops import reduce_mean -from openvino.runtime.opset1.ops import reduce_min -from openvino.runtime.opset1.ops import reduce_prod -from openvino.runtime.opset1.ops import reduce_sum -from openvino.runtime.opset1.ops import region_yolo -from openvino.runtime.opset1.ops import relu -from openvino.runtime.opset1.ops import reshape -from openvino.runtime.opset1.ops import result -from openvino.runtime.opset1.ops import reverse_sequence -from openvino.runtime.opset1.ops import select -from openvino.runtime.opset1.ops import selu -from openvino.runtime.opset1.ops import shape_of -from openvino.runtime.opset1.ops import sigmoid -from openvino.runtime.opset1.ops import sign -from openvino.runtime.opset1.ops import sin -from openvino.runtime.opset1.ops import sinh -from openvino.runtime.opset1.ops import softmax -from openvino.runtime.opset1.ops import space_to_depth -from openvino.runtime.opset1.ops import split -from openvino.runtime.opset1.ops import sqrt -from openvino.runtime.opset1.ops import squared_difference -from 
openvino.runtime.opset1.ops import squeeze -from openvino.runtime.opset1.ops import strided_slice -from openvino.runtime.opset1.ops import subtract -from openvino.runtime.opset1.ops import tan -from openvino.runtime.opset1.ops import tanh -from openvino.runtime.opset1.ops import tensor_iterator -from openvino.runtime.opset1.ops import tile -from openvino.runtime.opset1.ops import topk -from openvino.runtime.opset1.ops import transpose -from openvino.runtime.opset1.ops import unsqueeze -from openvino.runtime.opset1.ops import variadic_split +from openvino.opset1.ops import absolute +from openvino.opset1.ops import absolute as abs +from openvino.opset1.ops import acos +from openvino.opset1.ops import add +from openvino.opset1.ops import asin +from openvino.opset1.ops import atan +from openvino.opset1.ops import avg_pool +from openvino.opset1.ops import batch_norm_inference +from openvino.opset1.ops import binary_convolution +from openvino.opset1.ops import broadcast +from openvino.opset1.ops import ceiling +from openvino.opset1.ops import ceiling as ceil +from openvino.opset1.ops import clamp +from openvino.opset1.ops import concat +from openvino.opset1.ops import constant +from openvino.opset1.ops import convert +from openvino.opset1.ops import convert_like +from openvino.opset1.ops import convolution +from openvino.opset1.ops import convolution_backprop_data +from openvino.opset1.ops import cos +from openvino.opset1.ops import cosh +from openvino.opset1.ops import ctc_greedy_decoder +from openvino.opset1.ops import deformable_convolution +from openvino.opset1.ops import deformable_psroi_pooling +from openvino.opset1.ops import depth_to_space +from openvino.opset1.ops import detection_output +from openvino.opset1.ops import divide +from openvino.opset1.ops import elu +from openvino.opset1.ops import equal +from openvino.opset1.ops import erf +from openvino.opset1.ops import exp +from openvino.opset1.ops import fake_quantize +from openvino.opset1.ops import floor 
+from openvino.opset1.ops import floor_mod +from openvino.opset1.ops import gather +from openvino.opset1.ops import gather_tree +from openvino.opset1.ops import greater +from openvino.opset1.ops import greater_equal +from openvino.opset1.ops import grn +from openvino.opset1.ops import group_convolution +from openvino.opset1.ops import group_convolution_backprop_data +from openvino.opset1.ops import hard_sigmoid +from openvino.opset1.ops import interpolate +from openvino.opset1.ops import less +from openvino.opset1.ops import less_equal +from openvino.opset1.ops import log +from openvino.opset1.ops import logical_and +from openvino.opset1.ops import logical_not +from openvino.opset1.ops import logical_or +from openvino.opset1.ops import logical_xor +from openvino.opset1.ops import lrn +from openvino.opset1.ops import lstm_cell +from openvino.opset1.ops import matmul +from openvino.opset1.ops import max_pool +from openvino.opset1.ops import maximum +from openvino.opset1.ops import minimum +from openvino.opset1.ops import mod +from openvino.opset1.ops import multiply +from openvino.opset1.ops import negative +from openvino.opset1.ops import non_max_suppression +from openvino.opset1.ops import normalize_l2 +from openvino.opset1.ops import not_equal +from openvino.opset1.ops import one_hot +from openvino.opset1.ops import pad +from openvino.opset1.ops import parameter +from openvino.opset1.ops import power +from openvino.opset1.ops import prelu +from openvino.opset1.ops import prior_box +from openvino.opset1.ops import prior_box_clustered +from openvino.opset1.ops import psroi_pooling +from openvino.opset1.ops import proposal +from openvino.opset1.ops import range +from openvino.opset1.ops import reduce_logical_and +from openvino.opset1.ops import reduce_logical_or +from openvino.opset1.ops import reduce_max +from openvino.opset1.ops import reduce_mean +from openvino.opset1.ops import reduce_min +from openvino.opset1.ops import reduce_prod +from openvino.opset1.ops 
import reduce_sum +from openvino.opset1.ops import region_yolo +from openvino.opset1.ops import relu +from openvino.opset1.ops import reshape +from openvino.opset1.ops import result +from openvino.opset1.ops import reverse_sequence +from openvino.opset1.ops import select +from openvino.opset1.ops import selu +from openvino.opset1.ops import shape_of +from openvino.opset1.ops import sigmoid +from openvino.opset1.ops import sign +from openvino.opset1.ops import sin +from openvino.opset1.ops import sinh +from openvino.opset1.ops import softmax +from openvino.opset1.ops import space_to_depth +from openvino.opset1.ops import split +from openvino.opset1.ops import sqrt +from openvino.opset1.ops import squared_difference +from openvino.opset1.ops import squeeze +from openvino.opset1.ops import strided_slice +from openvino.opset1.ops import subtract +from openvino.opset1.ops import tan +from openvino.opset1.ops import tanh +from openvino.opset1.ops import tensor_iterator +from openvino.opset1.ops import tile +from openvino.opset1.ops import topk +from openvino.opset1.ops import transpose +from openvino.opset1.ops import unsqueeze +from openvino.opset1.ops import variadic_split diff --git a/src/bindings/python/src/openvino/runtime/opset1/ops/__init__.py b/src/bindings/python/src/openvino/runtime/opset1/ops/__init__.py new file mode 100644 index 00000000000000..acdd7d8becf465 --- /dev/null +++ b/src/bindings/python/src/openvino/runtime/opset1/ops/__init__.py @@ -0,0 +1,109 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.opset1.ops import absolute +from openvino.opset1.ops import acos +from openvino.opset1.ops import add +from openvino.opset1.ops import asin +from openvino.opset1.ops import atan +from openvino.opset1.ops import avg_pool +from openvino.opset1.ops import batch_norm_inference +from openvino.opset1.ops import binary_convolution +from openvino.opset1.ops import broadcast +from 
openvino.opset1.ops import ceiling +from openvino.opset1.ops import clamp +from openvino.opset1.ops import concat +from openvino.opset1.ops import constant +from openvino.opset1.ops import convert +from openvino.opset1.ops import convert_like +from openvino.opset1.ops import convolution +from openvino.opset1.ops import convolution_backprop_data +from openvino.opset1.ops import cos +from openvino.opset1.ops import cosh +from openvino.opset1.ops import ctc_greedy_decoder +from openvino.opset1.ops import deformable_convolution +from openvino.opset1.ops import deformable_psroi_pooling +from openvino.opset1.ops import depth_to_space +from openvino.opset1.ops import detection_output +from openvino.opset1.ops import divide +from openvino.opset1.ops import elu +from openvino.opset1.ops import equal +from openvino.opset1.ops import erf +from openvino.opset1.ops import exp +from openvino.opset1.ops import fake_quantize +from openvino.opset1.ops import floor +from openvino.opset1.ops import floor_mod +from openvino.opset1.ops import gather +from openvino.opset1.ops import gather_tree +from openvino.opset1.ops import greater +from openvino.opset1.ops import greater_equal +from openvino.opset1.ops import grn +from openvino.opset1.ops import group_convolution +from openvino.opset1.ops import group_convolution_backprop_data +from openvino.opset1.ops import hard_sigmoid +from openvino.opset1.ops import interpolate +from openvino.opset1.ops import less +from openvino.opset1.ops import less_equal +from openvino.opset1.ops import log +from openvino.opset1.ops import logical_and +from openvino.opset1.ops import logical_not +from openvino.opset1.ops import logical_or +from openvino.opset1.ops import logical_xor +from openvino.opset1.ops import lrn +from openvino.opset1.ops import lstm_cell +from openvino.opset1.ops import matmul +from openvino.opset1.ops import max_pool +from openvino.opset1.ops import maximum +from openvino.opset1.ops import minimum +from openvino.opset1.ops import 
mod +from openvino.opset1.ops import multiply +from openvino.opset1.ops import negative +from openvino.opset1.ops import non_max_suppression +from openvino.opset1.ops import normalize_l2 +from openvino.opset1.ops import not_equal +from openvino.opset1.ops import one_hot +from openvino.opset1.ops import pad +from openvino.opset1.ops import parameter +from openvino.opset1.ops import power +from openvino.opset1.ops import prelu +from openvino.opset1.ops import prior_box +from openvino.opset1.ops import prior_box_clustered +from openvino.opset1.ops import psroi_pooling +from openvino.opset1.ops import proposal +from openvino.opset1.ops import range +from openvino.opset1.ops import reduce_logical_and +from openvino.opset1.ops import reduce_logical_or +from openvino.opset1.ops import reduce_max +from openvino.opset1.ops import reduce_mean +from openvino.opset1.ops import reduce_min +from openvino.opset1.ops import reduce_prod +from openvino.opset1.ops import reduce_sum +from openvino.opset1.ops import region_yolo +from openvino.opset1.ops import relu +from openvino.opset1.ops import reshape +from openvino.opset1.ops import result +from openvino.opset1.ops import reverse_sequence +from openvino.opset1.ops import select +from openvino.opset1.ops import selu +from openvino.opset1.ops import shape_of +from openvino.opset1.ops import sigmoid +from openvino.opset1.ops import sign +from openvino.opset1.ops import sin +from openvino.opset1.ops import sinh +from openvino.opset1.ops import softmax +from openvino.opset1.ops import space_to_depth +from openvino.opset1.ops import split +from openvino.opset1.ops import sqrt +from openvino.opset1.ops import squared_difference +from openvino.opset1.ops import squeeze +from openvino.opset1.ops import strided_slice +from openvino.opset1.ops import subtract +from openvino.opset1.ops import tan +from openvino.opset1.ops import tanh +from openvino.opset1.ops import tensor_iterator +from openvino.opset1.ops import tile +from 
openvino.opset1.ops import topk +from openvino.opset1.ops import transpose +from openvino.opset1.ops import unsqueeze +from openvino.opset1.ops import variadic_split diff --git a/src/bindings/python/src/openvino/runtime/opset10/__init__.py b/src/bindings/python/src/openvino/runtime/opset10/__init__.py index 29ebcb27039abc..659cb99bb8644d 100644 --- a/src/bindings/python/src/openvino/runtime/opset10/__init__.py +++ b/src/bindings/python/src/openvino/runtime/opset10/__init__.py @@ -2,178 +2,178 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -from openvino.runtime.opset1.ops import absolute -from openvino.runtime.opset1.ops import absolute as abs -from openvino.runtime.opset1.ops import acos -from openvino.runtime.opset4.ops import acosh -from openvino.runtime.opset8.ops import adaptive_avg_pool -from openvino.runtime.opset8.ops import adaptive_max_pool -from openvino.runtime.opset1.ops import add -from openvino.runtime.opset1.ops import asin -from openvino.runtime.opset4.ops import asinh -from openvino.runtime.opset6.ops import assign -from openvino.runtime.opset1.ops import atan -from openvino.runtime.opset4.ops import atanh -from openvino.runtime.opset1.ops import avg_pool -from openvino.runtime.opset5.ops import batch_norm_inference -from openvino.runtime.opset2.ops import batch_to_space -from openvino.runtime.opset1.ops import binary_convolution -from openvino.runtime.opset3.ops import broadcast -from openvino.runtime.opset3.ops import bucketize -from openvino.runtime.opset1.ops import ceiling -from openvino.runtime.opset1.ops import ceiling as ceil -from openvino.runtime.opset1.ops import clamp -from openvino.runtime.opset1.ops import concat -from openvino.runtime.opset1.ops import constant -from openvino.runtime.opset1.ops import convert -from openvino.runtime.opset1.ops import convert_like -from openvino.runtime.opset1.ops import convolution -from openvino.runtime.opset1.ops import convolution_backprop_data -from 
openvino.runtime.opset1.ops import cos -from openvino.runtime.opset1.ops import cosh -from openvino.runtime.opset1.ops import ctc_greedy_decoder -from openvino.runtime.opset6.ops import ctc_greedy_decoder_seq_len -from openvino.runtime.opset4.ops import ctc_loss -from openvino.runtime.opset3.ops import cum_sum -from openvino.runtime.opset3.ops import cum_sum as cumsum -from openvino.runtime.opset8.ops import deformable_convolution -from openvino.runtime.opset1.ops import deformable_psroi_pooling -from openvino.runtime.opset1.ops import depth_to_space -from openvino.runtime.opset8.ops import detection_output -from openvino.runtime.opset7.ops import dft -from openvino.runtime.opset1.ops import divide -from openvino.runtime.opset7.ops import einsum -from openvino.runtime.opset1.ops import elu -from openvino.runtime.opset3.ops import embedding_bag_offsets_sum -from openvino.runtime.opset3.ops import embedding_bag_packed_sum -from openvino.runtime.opset3.ops import embedding_segments_sum -from openvino.runtime.opset3.ops import extract_image_patches -from openvino.runtime.opset1.ops import equal -from openvino.runtime.opset1.ops import erf -from openvino.runtime.opset1.ops import exp -from openvino.runtime.opset9.ops import eye -from openvino.runtime.opset1.ops import fake_quantize -from openvino.runtime.opset1.ops import floor -from openvino.runtime.opset1.ops import floor_mod -from openvino.runtime.opset8.ops import gather -from openvino.runtime.opset6.ops import gather_elements -from openvino.runtime.opset8.ops import gather_nd -from openvino.runtime.opset1.ops import gather_tree -from openvino.runtime.opset7.ops import gelu -from openvino.runtime.opset9.ops import generate_proposals -from openvino.runtime.opset1.ops import greater -from openvino.runtime.opset1.ops import greater_equal -from openvino.runtime.opset9.ops import grid_sample -from openvino.runtime.opset1.ops import grn -from openvino.runtime.opset1.ops import group_convolution -from 
openvino.runtime.opset1.ops import group_convolution_backprop_data -from openvino.runtime.opset3.ops import gru_cell -from openvino.runtime.opset5.ops import gru_sequence -from openvino.runtime.opset1.ops import hard_sigmoid -from openvino.runtime.opset5.ops import hsigmoid -from openvino.runtime.opset4.ops import hswish -from openvino.runtime.opset7.ops import idft -from openvino.runtime.opset8.ops import if_op -from openvino.runtime.opset10.ops import interpolate -from openvino.runtime.opset9.ops import irdft -from openvino.runtime.opset10.ops import is_finite -from openvino.runtime.opset10.ops import is_inf -from openvino.runtime.opset10.ops import is_nan -from openvino.runtime.opset8.ops import i420_to_bgr -from openvino.runtime.opset8.ops import i420_to_rgb -from openvino.runtime.opset1.ops import less -from openvino.runtime.opset1.ops import less_equal -from openvino.runtime.opset1.ops import log -from openvino.runtime.opset1.ops import logical_and -from openvino.runtime.opset1.ops import logical_not -from openvino.runtime.opset1.ops import logical_or -from openvino.runtime.opset1.ops import logical_xor -from openvino.runtime.opset5.ops import log_softmax -from openvino.runtime.opset5.ops import loop -from openvino.runtime.opset1.ops import lrn -from openvino.runtime.opset4.ops import lstm_cell -from openvino.runtime.opset5.ops import lstm_sequence -from openvino.runtime.opset1.ops import matmul -from openvino.runtime.opset8.ops import matrix_nms -from openvino.runtime.opset8.ops import max_pool -from openvino.runtime.opset1.ops import maximum -from openvino.runtime.opset1.ops import minimum -from openvino.runtime.opset4.ops import mish -from openvino.runtime.opset1.ops import mod -from openvino.runtime.opset9.ops import multiclass_nms -from openvino.runtime.opset1.ops import multiply -from openvino.runtime.opset6.ops import mvn -from openvino.runtime.opset1.ops import negative -from openvino.runtime.opset9.ops import non_max_suppression -from 
openvino.runtime.opset3.ops import non_zero -from openvino.runtime.opset1.ops import normalize_l2 -from openvino.runtime.opset1.ops import not_equal -from openvino.runtime.opset8.ops import nv12_to_bgr -from openvino.runtime.opset8.ops import nv12_to_rgb -from openvino.runtime.opset1.ops import one_hot -from openvino.runtime.opset1.ops import pad -from openvino.runtime.opset1.ops import parameter -from openvino.runtime.opset1.ops import power -from openvino.runtime.opset1.ops import prelu -from openvino.runtime.opset8.ops import prior_box -from openvino.runtime.opset1.ops import prior_box_clustered -from openvino.runtime.opset1.ops import psroi_pooling -from openvino.runtime.opset4.ops import proposal -from openvino.runtime.opset1.ops import range -from openvino.runtime.opset8.ops import random_uniform -from openvino.runtime.opset9.ops import rdft -from openvino.runtime.opset6.ops import read_value -from openvino.runtime.opset4.ops import reduce_l1 -from openvino.runtime.opset4.ops import reduce_l2 -from openvino.runtime.opset1.ops import reduce_logical_and -from openvino.runtime.opset1.ops import reduce_logical_or -from openvino.runtime.opset1.ops import reduce_max -from openvino.runtime.opset1.ops import reduce_mean -from openvino.runtime.opset1.ops import reduce_min -from openvino.runtime.opset1.ops import reduce_prod -from openvino.runtime.opset1.ops import reduce_sum -from openvino.runtime.opset1.ops import region_yolo -from openvino.runtime.opset2.ops import reorg_yolo -from openvino.runtime.opset1.ops import relu -from openvino.runtime.opset1.ops import reshape -from openvino.runtime.opset1.ops import result -from openvino.runtime.opset1.ops import reverse_sequence -from openvino.runtime.opset3.ops import rnn_cell -from openvino.runtime.opset5.ops import rnn_sequence -from openvino.runtime.opset9.ops import roi_align -from openvino.runtime.opset2.ops import roi_pooling -from openvino.runtime.opset7.ops import roll -from openvino.runtime.opset5.ops import 
round -from openvino.runtime.opset3.ops import scatter_elements_update -from openvino.runtime.opset4.ops import scatter_nd_update -from openvino.runtime.opset3.ops import scatter_update -from openvino.runtime.opset1.ops import select -from openvino.runtime.opset1.ops import selu -from openvino.runtime.opset3.ops import shape_of -from openvino.runtime.opset3.ops import shuffle_channels -from openvino.runtime.opset1.ops import sigmoid -from openvino.runtime.opset1.ops import sign -from openvino.runtime.opset1.ops import sin -from openvino.runtime.opset1.ops import sinh -from openvino.runtime.opset8.ops import slice -from openvino.runtime.opset8.ops import softmax -from openvino.runtime.opset4.ops import softplus -from openvino.runtime.opset9.ops import softsign -from openvino.runtime.opset2.ops import space_to_batch -from openvino.runtime.opset1.ops import space_to_depth -from openvino.runtime.opset1.ops import split -from openvino.runtime.opset1.ops import sqrt -from openvino.runtime.opset1.ops import squared_difference -from openvino.runtime.opset1.ops import squeeze -from openvino.runtime.opset1.ops import strided_slice -from openvino.runtime.opset1.ops import subtract -from openvino.runtime.opset4.ops import swish -from openvino.runtime.opset1.ops import tan -from openvino.runtime.opset1.ops import tanh -from openvino.runtime.opset1.ops import tensor_iterator -from openvino.runtime.opset1.ops import tile -from openvino.runtime.opset3.ops import topk -from openvino.runtime.opset1.ops import transpose -from openvino.runtime.opset10.ops import unique -from openvino.runtime.opset1.ops import unsqueeze -from openvino.runtime.opset1.ops import variadic_split +from openvino.opset1.ops import absolute +from openvino.opset1.ops import absolute as abs +from openvino.opset1.ops import acos +from openvino.opset4.ops import acosh +from openvino.opset8.ops import adaptive_avg_pool +from openvino.opset8.ops import adaptive_max_pool +from openvino.opset1.ops import add +from 
openvino.opset1.ops import asin +from openvino.opset4.ops import asinh +from openvino.opset6.ops import assign +from openvino.opset1.ops import atan +from openvino.opset4.ops import atanh +from openvino.opset1.ops import avg_pool +from openvino.opset5.ops import batch_norm_inference +from openvino.opset2.ops import batch_to_space +from openvino.opset1.ops import binary_convolution +from openvino.opset3.ops import broadcast +from openvino.opset3.ops import bucketize +from openvino.opset1.ops import ceiling +from openvino.opset1.ops import ceiling as ceil +from openvino.opset1.ops import clamp +from openvino.opset1.ops import concat +from openvino.opset1.ops import constant +from openvino.opset1.ops import convert +from openvino.opset1.ops import convert_like +from openvino.opset1.ops import convolution +from openvino.opset1.ops import convolution_backprop_data +from openvino.opset1.ops import cos +from openvino.opset1.ops import cosh +from openvino.opset1.ops import ctc_greedy_decoder +from openvino.opset6.ops import ctc_greedy_decoder_seq_len +from openvino.opset4.ops import ctc_loss +from openvino.opset3.ops import cum_sum +from openvino.opset3.ops import cum_sum as cumsum +from openvino.opset8.ops import deformable_convolution +from openvino.opset1.ops import deformable_psroi_pooling +from openvino.opset1.ops import depth_to_space +from openvino.opset8.ops import detection_output +from openvino.opset7.ops import dft +from openvino.opset1.ops import divide +from openvino.opset7.ops import einsum +from openvino.opset1.ops import elu +from openvino.opset3.ops import embedding_bag_offsets_sum +from openvino.opset3.ops import embedding_bag_packed_sum +from openvino.opset3.ops import embedding_segments_sum +from openvino.opset3.ops import extract_image_patches +from openvino.opset1.ops import equal +from openvino.opset1.ops import erf +from openvino.opset1.ops import exp +from openvino.opset9.ops import eye +from openvino.opset1.ops import fake_quantize +from 
openvino.opset1.ops import floor +from openvino.opset1.ops import floor_mod +from openvino.opset8.ops import gather +from openvino.opset6.ops import gather_elements +from openvino.opset8.ops import gather_nd +from openvino.opset1.ops import gather_tree +from openvino.opset7.ops import gelu +from openvino.opset9.ops import generate_proposals +from openvino.opset1.ops import greater +from openvino.opset1.ops import greater_equal +from openvino.opset9.ops import grid_sample +from openvino.opset1.ops import grn +from openvino.opset1.ops import group_convolution +from openvino.opset1.ops import group_convolution_backprop_data +from openvino.opset3.ops import gru_cell +from openvino.opset5.ops import gru_sequence +from openvino.opset1.ops import hard_sigmoid +from openvino.opset5.ops import hsigmoid +from openvino.opset4.ops import hswish +from openvino.opset7.ops import idft +from openvino.opset8.ops import if_op +from openvino.opset10.ops import interpolate +from openvino.opset9.ops import irdft +from openvino.opset10.ops import is_finite +from openvino.opset10.ops import is_inf +from openvino.opset10.ops import is_nan +from openvino.opset8.ops import i420_to_bgr +from openvino.opset8.ops import i420_to_rgb +from openvino.opset1.ops import less +from openvino.opset1.ops import less_equal +from openvino.opset1.ops import log +from openvino.opset1.ops import logical_and +from openvino.opset1.ops import logical_not +from openvino.opset1.ops import logical_or +from openvino.opset1.ops import logical_xor +from openvino.opset5.ops import log_softmax +from openvino.opset5.ops import loop +from openvino.opset1.ops import lrn +from openvino.opset4.ops import lstm_cell +from openvino.opset5.ops import lstm_sequence +from openvino.opset1.ops import matmul +from openvino.opset8.ops import matrix_nms +from openvino.opset8.ops import max_pool +from openvino.opset1.ops import maximum +from openvino.opset1.ops import minimum +from openvino.opset4.ops import mish +from 
openvino.opset1.ops import mod +from openvino.opset9.ops import multiclass_nms +from openvino.opset1.ops import multiply +from openvino.opset6.ops import mvn +from openvino.opset1.ops import negative +from openvino.opset9.ops import non_max_suppression +from openvino.opset3.ops import non_zero +from openvino.opset1.ops import normalize_l2 +from openvino.opset1.ops import not_equal +from openvino.opset8.ops import nv12_to_bgr +from openvino.opset8.ops import nv12_to_rgb +from openvino.opset1.ops import one_hot +from openvino.opset1.ops import pad +from openvino.opset1.ops import parameter +from openvino.opset1.ops import power +from openvino.opset1.ops import prelu +from openvino.opset8.ops import prior_box +from openvino.opset1.ops import prior_box_clustered +from openvino.opset1.ops import psroi_pooling +from openvino.opset4.ops import proposal +from openvino.opset1.ops import range +from openvino.opset8.ops import random_uniform +from openvino.opset9.ops import rdft +from openvino.opset6.ops import read_value +from openvino.opset4.ops import reduce_l1 +from openvino.opset4.ops import reduce_l2 +from openvino.opset1.ops import reduce_logical_and +from openvino.opset1.ops import reduce_logical_or +from openvino.opset1.ops import reduce_max +from openvino.opset1.ops import reduce_mean +from openvino.opset1.ops import reduce_min +from openvino.opset1.ops import reduce_prod +from openvino.opset1.ops import reduce_sum +from openvino.opset1.ops import region_yolo +from openvino.opset2.ops import reorg_yolo +from openvino.opset1.ops import relu +from openvino.opset1.ops import reshape +from openvino.opset1.ops import result +from openvino.opset1.ops import reverse_sequence +from openvino.opset3.ops import rnn_cell +from openvino.opset5.ops import rnn_sequence +from openvino.opset9.ops import roi_align +from openvino.opset2.ops import roi_pooling +from openvino.opset7.ops import roll +from openvino.opset5.ops import round +from openvino.opset3.ops import 
scatter_elements_update +from openvino.opset4.ops import scatter_nd_update +from openvino.opset3.ops import scatter_update +from openvino.opset1.ops import select +from openvino.opset1.ops import selu +from openvino.opset3.ops import shape_of +from openvino.opset3.ops import shuffle_channels +from openvino.opset1.ops import sigmoid +from openvino.opset1.ops import sign +from openvino.opset1.ops import sin +from openvino.opset1.ops import sinh +from openvino.opset8.ops import slice +from openvino.opset8.ops import softmax +from openvino.opset4.ops import softplus +from openvino.opset9.ops import softsign +from openvino.opset2.ops import space_to_batch +from openvino.opset1.ops import space_to_depth +from openvino.opset1.ops import split +from openvino.opset1.ops import sqrt +from openvino.opset1.ops import squared_difference +from openvino.opset1.ops import squeeze +from openvino.opset1.ops import strided_slice +from openvino.opset1.ops import subtract +from openvino.opset4.ops import swish +from openvino.opset1.ops import tan +from openvino.opset1.ops import tanh +from openvino.opset1.ops import tensor_iterator +from openvino.opset1.ops import tile +from openvino.opset3.ops import topk +from openvino.opset1.ops import transpose +from openvino.opset10.ops import unique +from openvino.opset1.ops import unsqueeze +from openvino.opset1.ops import variadic_split diff --git a/src/bindings/python/src/openvino/runtime/opset10/ops/__init__.py b/src/bindings/python/src/openvino/runtime/opset10/ops/__init__.py new file mode 100644 index 00000000000000..f6c134d9ab563e --- /dev/null +++ b/src/bindings/python/src/openvino/runtime/opset10/ops/__init__.py @@ -0,0 +1,9 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.opset10.ops import interpolate +from openvino.opset10.ops import is_finite +from openvino.opset10.ops import is_inf +from openvino.opset10.ops import is_nan +from openvino.opset10.ops 
import unique diff --git a/src/bindings/python/src/openvino/runtime/opset11/__init__.py b/src/bindings/python/src/openvino/runtime/opset11/__init__.py index b692741257d435..2a07b9fab9f2c4 100644 --- a/src/bindings/python/src/openvino/runtime/opset11/__init__.py +++ b/src/bindings/python/src/openvino/runtime/opset11/__init__.py @@ -2,178 +2,178 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -from openvino.runtime.opset1.ops import absolute -from openvino.runtime.opset1.ops import absolute as abs -from openvino.runtime.opset1.ops import acos -from openvino.runtime.opset4.ops import acosh -from openvino.runtime.opset8.ops import adaptive_avg_pool -from openvino.runtime.opset8.ops import adaptive_max_pool -from openvino.runtime.opset1.ops import add -from openvino.runtime.opset1.ops import asin -from openvino.runtime.opset4.ops import asinh -from openvino.runtime.opset6.ops import assign -from openvino.runtime.opset1.ops import atan -from openvino.runtime.opset4.ops import atanh -from openvino.runtime.opset1.ops import avg_pool -from openvino.runtime.opset5.ops import batch_norm_inference -from openvino.runtime.opset2.ops import batch_to_space -from openvino.runtime.opset1.ops import binary_convolution -from openvino.runtime.opset3.ops import broadcast -from openvino.runtime.opset3.ops import bucketize -from openvino.runtime.opset1.ops import ceiling -from openvino.runtime.opset1.ops import ceiling as ceil -from openvino.runtime.opset1.ops import clamp -from openvino.runtime.opset1.ops import concat -from openvino.runtime.opset1.ops import constant -from openvino.runtime.opset1.ops import convert -from openvino.runtime.opset1.ops import convert_like -from openvino.runtime.opset1.ops import convolution -from openvino.runtime.opset1.ops import convolution_backprop_data -from openvino.runtime.opset1.ops import cos -from openvino.runtime.opset1.ops import cosh -from openvino.runtime.opset1.ops import ctc_greedy_decoder -from 
openvino.runtime.opset6.ops import ctc_greedy_decoder_seq_len -from openvino.runtime.opset4.ops import ctc_loss -from openvino.runtime.opset3.ops import cum_sum -from openvino.runtime.opset3.ops import cum_sum as cumsum -from openvino.runtime.opset8.ops import deformable_convolution -from openvino.runtime.opset1.ops import deformable_psroi_pooling -from openvino.runtime.opset1.ops import depth_to_space -from openvino.runtime.opset8.ops import detection_output -from openvino.runtime.opset7.ops import dft -from openvino.runtime.opset1.ops import divide -from openvino.runtime.opset7.ops import einsum -from openvino.runtime.opset1.ops import elu -from openvino.runtime.opset3.ops import embedding_bag_offsets_sum -from openvino.runtime.opset3.ops import embedding_bag_packed_sum -from openvino.runtime.opset3.ops import embedding_segments_sum -from openvino.runtime.opset3.ops import extract_image_patches -from openvino.runtime.opset1.ops import equal -from openvino.runtime.opset1.ops import erf -from openvino.runtime.opset1.ops import exp -from openvino.runtime.opset9.ops import eye -from openvino.runtime.opset1.ops import fake_quantize -from openvino.runtime.opset1.ops import floor -from openvino.runtime.opset1.ops import floor_mod -from openvino.runtime.opset8.ops import gather -from openvino.runtime.opset6.ops import gather_elements -from openvino.runtime.opset8.ops import gather_nd -from openvino.runtime.opset1.ops import gather_tree -from openvino.runtime.opset7.ops import gelu -from openvino.runtime.opset9.ops import generate_proposals -from openvino.runtime.opset1.ops import greater -from openvino.runtime.opset1.ops import greater_equal -from openvino.runtime.opset9.ops import grid_sample -from openvino.runtime.opset1.ops import grn -from openvino.runtime.opset1.ops import group_convolution -from openvino.runtime.opset1.ops import group_convolution_backprop_data -from openvino.runtime.opset3.ops import gru_cell -from openvino.runtime.opset5.ops import gru_sequence 
-from openvino.runtime.opset1.ops import hard_sigmoid -from openvino.runtime.opset5.ops import hsigmoid -from openvino.runtime.opset4.ops import hswish -from openvino.runtime.opset7.ops import idft -from openvino.runtime.opset8.ops import if_op -from openvino.runtime.opset11.ops import interpolate -from openvino.runtime.opset9.ops import irdft -from openvino.runtime.opset10.ops import is_finite -from openvino.runtime.opset10.ops import is_inf -from openvino.runtime.opset10.ops import is_nan -from openvino.runtime.opset8.ops import i420_to_bgr -from openvino.runtime.opset8.ops import i420_to_rgb -from openvino.runtime.opset1.ops import less -from openvino.runtime.opset1.ops import less_equal -from openvino.runtime.opset1.ops import log -from openvino.runtime.opset1.ops import logical_and -from openvino.runtime.opset1.ops import logical_not -from openvino.runtime.opset1.ops import logical_or -from openvino.runtime.opset1.ops import logical_xor -from openvino.runtime.opset5.ops import log_softmax -from openvino.runtime.opset5.ops import loop -from openvino.runtime.opset1.ops import lrn -from openvino.runtime.opset4.ops import lstm_cell -from openvino.runtime.opset5.ops import lstm_sequence -from openvino.runtime.opset1.ops import matmul -from openvino.runtime.opset8.ops import matrix_nms -from openvino.runtime.opset8.ops import max_pool -from openvino.runtime.opset1.ops import maximum -from openvino.runtime.opset1.ops import minimum -from openvino.runtime.opset4.ops import mish -from openvino.runtime.opset1.ops import mod -from openvino.runtime.opset9.ops import multiclass_nms -from openvino.runtime.opset1.ops import multiply -from openvino.runtime.opset6.ops import mvn -from openvino.runtime.opset1.ops import negative -from openvino.runtime.opset9.ops import non_max_suppression -from openvino.runtime.opset3.ops import non_zero -from openvino.runtime.opset1.ops import normalize_l2 -from openvino.runtime.opset1.ops import not_equal -from openvino.runtime.opset8.ops 
import nv12_to_bgr -from openvino.runtime.opset8.ops import nv12_to_rgb -from openvino.runtime.opset1.ops import one_hot -from openvino.runtime.opset1.ops import pad -from openvino.runtime.opset1.ops import parameter -from openvino.runtime.opset1.ops import power -from openvino.runtime.opset1.ops import prelu -from openvino.runtime.opset8.ops import prior_box -from openvino.runtime.opset1.ops import prior_box_clustered -from openvino.runtime.opset1.ops import psroi_pooling -from openvino.runtime.opset4.ops import proposal -from openvino.runtime.opset1.ops import range -from openvino.runtime.opset8.ops import random_uniform -from openvino.runtime.opset9.ops import rdft -from openvino.runtime.opset6.ops import read_value -from openvino.runtime.opset4.ops import reduce_l1 -from openvino.runtime.opset4.ops import reduce_l2 -from openvino.runtime.opset1.ops import reduce_logical_and -from openvino.runtime.opset1.ops import reduce_logical_or -from openvino.runtime.opset1.ops import reduce_max -from openvino.runtime.opset1.ops import reduce_mean -from openvino.runtime.opset1.ops import reduce_min -from openvino.runtime.opset1.ops import reduce_prod -from openvino.runtime.opset1.ops import reduce_sum -from openvino.runtime.opset1.ops import region_yolo -from openvino.runtime.opset2.ops import reorg_yolo -from openvino.runtime.opset1.ops import relu -from openvino.runtime.opset1.ops import reshape -from openvino.runtime.opset1.ops import result -from openvino.runtime.opset1.ops import reverse_sequence -from openvino.runtime.opset3.ops import rnn_cell -from openvino.runtime.opset5.ops import rnn_sequence -from openvino.runtime.opset9.ops import roi_align -from openvino.runtime.opset2.ops import roi_pooling -from openvino.runtime.opset7.ops import roll -from openvino.runtime.opset5.ops import round -from openvino.runtime.opset3.ops import scatter_elements_update -from openvino.runtime.opset4.ops import scatter_nd_update -from openvino.runtime.opset3.ops import scatter_update 
-from openvino.runtime.opset1.ops import select -from openvino.runtime.opset1.ops import selu -from openvino.runtime.opset3.ops import shape_of -from openvino.runtime.opset3.ops import shuffle_channels -from openvino.runtime.opset1.ops import sigmoid -from openvino.runtime.opset1.ops import sign -from openvino.runtime.opset1.ops import sin -from openvino.runtime.opset1.ops import sinh -from openvino.runtime.opset8.ops import slice -from openvino.runtime.opset8.ops import softmax -from openvino.runtime.opset4.ops import softplus -from openvino.runtime.opset9.ops import softsign -from openvino.runtime.opset2.ops import space_to_batch -from openvino.runtime.opset1.ops import space_to_depth -from openvino.runtime.opset1.ops import split -from openvino.runtime.opset1.ops import sqrt -from openvino.runtime.opset1.ops import squared_difference -from openvino.runtime.opset1.ops import squeeze -from openvino.runtime.opset1.ops import strided_slice -from openvino.runtime.opset1.ops import subtract -from openvino.runtime.opset4.ops import swish -from openvino.runtime.opset1.ops import tan -from openvino.runtime.opset1.ops import tanh -from openvino.runtime.opset1.ops import tensor_iterator -from openvino.runtime.opset1.ops import tile -from openvino.runtime.opset11.ops import topk -from openvino.runtime.opset1.ops import transpose -from openvino.runtime.opset10.ops import unique -from openvino.runtime.opset1.ops import unsqueeze -from openvino.runtime.opset1.ops import variadic_split +from openvino.opset1.ops import absolute +from openvino.opset1.ops import absolute as abs +from openvino.opset1.ops import acos +from openvino.opset4.ops import acosh +from openvino.opset8.ops import adaptive_avg_pool +from openvino.opset8.ops import adaptive_max_pool +from openvino.opset1.ops import add +from openvino.opset1.ops import asin +from openvino.opset4.ops import asinh +from openvino.opset6.ops import assign +from openvino.opset1.ops import atan +from openvino.opset4.ops import atanh 
+from openvino.opset1.ops import avg_pool +from openvino.opset5.ops import batch_norm_inference +from openvino.opset2.ops import batch_to_space +from openvino.opset1.ops import binary_convolution +from openvino.opset3.ops import broadcast +from openvino.opset3.ops import bucketize +from openvino.opset1.ops import ceiling +from openvino.opset1.ops import ceiling as ceil +from openvino.opset1.ops import clamp +from openvino.opset1.ops import concat +from openvino.opset1.ops import constant +from openvino.opset1.ops import convert +from openvino.opset1.ops import convert_like +from openvino.opset1.ops import convolution +from openvino.opset1.ops import convolution_backprop_data +from openvino.opset1.ops import cos +from openvino.opset1.ops import cosh +from openvino.opset1.ops import ctc_greedy_decoder +from openvino.opset6.ops import ctc_greedy_decoder_seq_len +from openvino.opset4.ops import ctc_loss +from openvino.opset3.ops import cum_sum +from openvino.opset3.ops import cum_sum as cumsum +from openvino.opset8.ops import deformable_convolution +from openvino.opset1.ops import deformable_psroi_pooling +from openvino.opset1.ops import depth_to_space +from openvino.opset8.ops import detection_output +from openvino.opset7.ops import dft +from openvino.opset1.ops import divide +from openvino.opset7.ops import einsum +from openvino.opset1.ops import elu +from openvino.opset3.ops import embedding_bag_offsets_sum +from openvino.opset3.ops import embedding_bag_packed_sum +from openvino.opset3.ops import embedding_segments_sum +from openvino.opset3.ops import extract_image_patches +from openvino.opset1.ops import equal +from openvino.opset1.ops import erf +from openvino.opset1.ops import exp +from openvino.opset9.ops import eye +from openvino.opset1.ops import fake_quantize +from openvino.opset1.ops import floor +from openvino.opset1.ops import floor_mod +from openvino.opset8.ops import gather +from openvino.opset6.ops import gather_elements +from openvino.opset8.ops import 
gather_nd +from openvino.opset1.ops import gather_tree +from openvino.opset7.ops import gelu +from openvino.opset9.ops import generate_proposals +from openvino.opset1.ops import greater +from openvino.opset1.ops import greater_equal +from openvino.opset9.ops import grid_sample +from openvino.opset1.ops import grn +from openvino.opset1.ops import group_convolution +from openvino.opset1.ops import group_convolution_backprop_data +from openvino.opset3.ops import gru_cell +from openvino.opset5.ops import gru_sequence +from openvino.opset1.ops import hard_sigmoid +from openvino.opset5.ops import hsigmoid +from openvino.opset4.ops import hswish +from openvino.opset7.ops import idft +from openvino.opset8.ops import if_op +from openvino.opset11.ops import interpolate +from openvino.opset9.ops import irdft +from openvino.opset10.ops import is_finite +from openvino.opset10.ops import is_inf +from openvino.opset10.ops import is_nan +from openvino.opset8.ops import i420_to_bgr +from openvino.opset8.ops import i420_to_rgb +from openvino.opset1.ops import less +from openvino.opset1.ops import less_equal +from openvino.opset1.ops import log +from openvino.opset1.ops import logical_and +from openvino.opset1.ops import logical_not +from openvino.opset1.ops import logical_or +from openvino.opset1.ops import logical_xor +from openvino.opset5.ops import log_softmax +from openvino.opset5.ops import loop +from openvino.opset1.ops import lrn +from openvino.opset4.ops import lstm_cell +from openvino.opset5.ops import lstm_sequence +from openvino.opset1.ops import matmul +from openvino.opset8.ops import matrix_nms +from openvino.opset8.ops import max_pool +from openvino.opset1.ops import maximum +from openvino.opset1.ops import minimum +from openvino.opset4.ops import mish +from openvino.opset1.ops import mod +from openvino.opset9.ops import multiclass_nms +from openvino.opset1.ops import multiply +from openvino.opset6.ops import mvn +from openvino.opset1.ops import negative +from 
openvino.opset9.ops import non_max_suppression +from openvino.opset3.ops import non_zero +from openvino.opset1.ops import normalize_l2 +from openvino.opset1.ops import not_equal +from openvino.opset8.ops import nv12_to_bgr +from openvino.opset8.ops import nv12_to_rgb +from openvino.opset1.ops import one_hot +from openvino.opset1.ops import pad +from openvino.opset1.ops import parameter +from openvino.opset1.ops import power +from openvino.opset1.ops import prelu +from openvino.opset8.ops import prior_box +from openvino.opset1.ops import prior_box_clustered +from openvino.opset1.ops import psroi_pooling +from openvino.opset4.ops import proposal +from openvino.opset1.ops import range +from openvino.opset8.ops import random_uniform +from openvino.opset9.ops import rdft +from openvino.opset6.ops import read_value +from openvino.opset4.ops import reduce_l1 +from openvino.opset4.ops import reduce_l2 +from openvino.opset1.ops import reduce_logical_and +from openvino.opset1.ops import reduce_logical_or +from openvino.opset1.ops import reduce_max +from openvino.opset1.ops import reduce_mean +from openvino.opset1.ops import reduce_min +from openvino.opset1.ops import reduce_prod +from openvino.opset1.ops import reduce_sum +from openvino.opset1.ops import region_yolo +from openvino.opset2.ops import reorg_yolo +from openvino.opset1.ops import relu +from openvino.opset1.ops import reshape +from openvino.opset1.ops import result +from openvino.opset1.ops import reverse_sequence +from openvino.opset3.ops import rnn_cell +from openvino.opset5.ops import rnn_sequence +from openvino.opset9.ops import roi_align +from openvino.opset2.ops import roi_pooling +from openvino.opset7.ops import roll +from openvino.opset5.ops import round +from openvino.opset3.ops import scatter_elements_update +from openvino.opset4.ops import scatter_nd_update +from openvino.opset3.ops import scatter_update +from openvino.opset1.ops import select +from openvino.opset1.ops import selu +from 
openvino.opset3.ops import shape_of +from openvino.opset3.ops import shuffle_channels +from openvino.opset1.ops import sigmoid +from openvino.opset1.ops import sign +from openvino.opset1.ops import sin +from openvino.opset1.ops import sinh +from openvino.opset8.ops import slice +from openvino.opset8.ops import softmax +from openvino.opset4.ops import softplus +from openvino.opset9.ops import softsign +from openvino.opset2.ops import space_to_batch +from openvino.opset1.ops import space_to_depth +from openvino.opset1.ops import split +from openvino.opset1.ops import sqrt +from openvino.opset1.ops import squared_difference +from openvino.opset1.ops import squeeze +from openvino.opset1.ops import strided_slice +from openvino.opset1.ops import subtract +from openvino.opset4.ops import swish +from openvino.opset1.ops import tan +from openvino.opset1.ops import tanh +from openvino.opset1.ops import tensor_iterator +from openvino.opset1.ops import tile +from openvino.opset11.ops import topk +from openvino.opset1.ops import transpose +from openvino.opset10.ops import unique +from openvino.opset1.ops import unsqueeze +from openvino.opset1.ops import variadic_split diff --git a/src/bindings/python/src/openvino/runtime/opset11/ops/__init__.py b/src/bindings/python/src/openvino/runtime/opset11/ops/__init__.py new file mode 100644 index 00000000000000..2ef74d4a6b58db --- /dev/null +++ b/src/bindings/python/src/openvino/runtime/opset11/ops/__init__.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.opset11.ops import interpolate +from openvino.opset11.ops import topk diff --git a/src/bindings/python/src/openvino/runtime/opset12/__init__.py b/src/bindings/python/src/openvino/runtime/opset12/__init__.py index 381d82ce8c9be1..1187f2c83e6a05 100644 --- a/src/bindings/python/src/openvino/runtime/opset12/__init__.py +++ b/src/bindings/python/src/openvino/runtime/opset12/__init__.py @@ -2,179 
+2,179 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -from openvino.runtime.opset1.ops import absolute -from openvino.runtime.opset1.ops import absolute as abs -from openvino.runtime.opset1.ops import acos -from openvino.runtime.opset4.ops import acosh -from openvino.runtime.opset8.ops import adaptive_avg_pool -from openvino.runtime.opset8.ops import adaptive_max_pool -from openvino.runtime.opset1.ops import add -from openvino.runtime.opset1.ops import asin -from openvino.runtime.opset4.ops import asinh -from openvino.runtime.opset6.ops import assign -from openvino.runtime.opset1.ops import atan -from openvino.runtime.opset4.ops import atanh -from openvino.runtime.opset1.ops import avg_pool -from openvino.runtime.opset5.ops import batch_norm_inference -from openvino.runtime.opset2.ops import batch_to_space -from openvino.runtime.opset1.ops import binary_convolution -from openvino.runtime.opset3.ops import broadcast -from openvino.runtime.opset3.ops import bucketize -from openvino.runtime.opset1.ops import ceiling -from openvino.runtime.opset1.ops import ceiling as ceil -from openvino.runtime.opset1.ops import clamp -from openvino.runtime.opset1.ops import concat -from openvino.runtime.opset1.ops import constant -from openvino.runtime.opset1.ops import convert -from openvino.runtime.opset1.ops import convert_like -from openvino.runtime.opset1.ops import convolution -from openvino.runtime.opset1.ops import convolution_backprop_data -from openvino.runtime.opset1.ops import cos -from openvino.runtime.opset1.ops import cosh -from openvino.runtime.opset1.ops import ctc_greedy_decoder -from openvino.runtime.opset6.ops import ctc_greedy_decoder_seq_len -from openvino.runtime.opset4.ops import ctc_loss -from openvino.runtime.opset3.ops import cum_sum -from openvino.runtime.opset3.ops import cum_sum as cumsum -from openvino.runtime.opset8.ops import deformable_convolution -from openvino.runtime.opset1.ops import deformable_psroi_pooling 
-from openvino.runtime.opset1.ops import depth_to_space -from openvino.runtime.opset8.ops import detection_output -from openvino.runtime.opset7.ops import dft -from openvino.runtime.opset1.ops import divide -from openvino.runtime.opset7.ops import einsum -from openvino.runtime.opset1.ops import elu -from openvino.runtime.opset3.ops import embedding_bag_offsets_sum -from openvino.runtime.opset3.ops import embedding_bag_packed_sum -from openvino.runtime.opset3.ops import embedding_segments_sum -from openvino.runtime.opset3.ops import extract_image_patches -from openvino.runtime.opset1.ops import equal -from openvino.runtime.opset1.ops import erf -from openvino.runtime.opset1.ops import exp -from openvino.runtime.opset9.ops import eye -from openvino.runtime.opset1.ops import fake_quantize -from openvino.runtime.opset1.ops import floor -from openvino.runtime.opset1.ops import floor_mod -from openvino.runtime.opset8.ops import gather -from openvino.runtime.opset6.ops import gather_elements -from openvino.runtime.opset8.ops import gather_nd -from openvino.runtime.opset1.ops import gather_tree -from openvino.runtime.opset7.ops import gelu -from openvino.runtime.opset9.ops import generate_proposals -from openvino.runtime.opset1.ops import greater -from openvino.runtime.opset1.ops import greater_equal -from openvino.runtime.opset9.ops import grid_sample -from openvino.runtime.opset1.ops import grn -from openvino.runtime.opset1.ops import group_convolution -from openvino.runtime.opset1.ops import group_convolution_backprop_data -from openvino.runtime.opset12.ops import group_normalization -from openvino.runtime.opset3.ops import gru_cell -from openvino.runtime.opset5.ops import gru_sequence -from openvino.runtime.opset1.ops import hard_sigmoid -from openvino.runtime.opset5.ops import hsigmoid -from openvino.runtime.opset4.ops import hswish -from openvino.runtime.opset7.ops import idft -from openvino.runtime.opset8.ops import if_op -from openvino.runtime.opset11.ops import 
interpolate -from openvino.runtime.opset9.ops import irdft -from openvino.runtime.opset10.ops import is_finite -from openvino.runtime.opset10.ops import is_inf -from openvino.runtime.opset10.ops import is_nan -from openvino.runtime.opset8.ops import i420_to_bgr -from openvino.runtime.opset8.ops import i420_to_rgb -from openvino.runtime.opset1.ops import less -from openvino.runtime.opset1.ops import less_equal -from openvino.runtime.opset1.ops import log -from openvino.runtime.opset1.ops import logical_and -from openvino.runtime.opset1.ops import logical_not -from openvino.runtime.opset1.ops import logical_or -from openvino.runtime.opset1.ops import logical_xor -from openvino.runtime.opset5.ops import log_softmax -from openvino.runtime.opset5.ops import loop -from openvino.runtime.opset1.ops import lrn -from openvino.runtime.opset4.ops import lstm_cell -from openvino.runtime.opset5.ops import lstm_sequence -from openvino.runtime.opset1.ops import matmul -from openvino.runtime.opset8.ops import matrix_nms -from openvino.runtime.opset8.ops import max_pool -from openvino.runtime.opset1.ops import maximum -from openvino.runtime.opset1.ops import minimum -from openvino.runtime.opset4.ops import mish -from openvino.runtime.opset1.ops import mod -from openvino.runtime.opset9.ops import multiclass_nms -from openvino.runtime.opset1.ops import multiply -from openvino.runtime.opset6.ops import mvn -from openvino.runtime.opset1.ops import negative -from openvino.runtime.opset9.ops import non_max_suppression -from openvino.runtime.opset3.ops import non_zero -from openvino.runtime.opset1.ops import normalize_l2 -from openvino.runtime.opset1.ops import not_equal -from openvino.runtime.opset8.ops import nv12_to_bgr -from openvino.runtime.opset8.ops import nv12_to_rgb -from openvino.runtime.opset1.ops import one_hot -from openvino.runtime.opset12.ops import pad -from openvino.runtime.opset1.ops import parameter -from openvino.runtime.opset1.ops import power -from 
openvino.runtime.opset1.ops import prelu -from openvino.runtime.opset8.ops import prior_box -from openvino.runtime.opset1.ops import prior_box_clustered -from openvino.runtime.opset1.ops import psroi_pooling -from openvino.runtime.opset4.ops import proposal -from openvino.runtime.opset4.ops import range -from openvino.runtime.opset8.ops import random_uniform -from openvino.runtime.opset9.ops import rdft -from openvino.runtime.opset6.ops import read_value -from openvino.runtime.opset4.ops import reduce_l1 -from openvino.runtime.opset4.ops import reduce_l2 -from openvino.runtime.opset1.ops import reduce_logical_and -from openvino.runtime.opset1.ops import reduce_logical_or -from openvino.runtime.opset1.ops import reduce_max -from openvino.runtime.opset1.ops import reduce_mean -from openvino.runtime.opset1.ops import reduce_min -from openvino.runtime.opset1.ops import reduce_prod -from openvino.runtime.opset1.ops import reduce_sum -from openvino.runtime.opset1.ops import region_yolo -from openvino.runtime.opset2.ops import reorg_yolo -from openvino.runtime.opset1.ops import relu -from openvino.runtime.opset1.ops import reshape -from openvino.runtime.opset1.ops import result -from openvino.runtime.opset1.ops import reverse_sequence -from openvino.runtime.opset3.ops import rnn_cell -from openvino.runtime.opset5.ops import rnn_sequence -from openvino.runtime.opset9.ops import roi_align -from openvino.runtime.opset2.ops import roi_pooling -from openvino.runtime.opset7.ops import roll -from openvino.runtime.opset5.ops import round -from openvino.runtime.opset12.ops import scatter_elements_update -from openvino.runtime.opset4.ops import scatter_nd_update -from openvino.runtime.opset3.ops import scatter_update -from openvino.runtime.opset1.ops import select -from openvino.runtime.opset1.ops import selu -from openvino.runtime.opset3.ops import shape_of -from openvino.runtime.opset3.ops import shuffle_channels -from openvino.runtime.opset1.ops import sigmoid -from 
openvino.runtime.opset1.ops import sign -from openvino.runtime.opset1.ops import sin -from openvino.runtime.opset1.ops import sinh -from openvino.runtime.opset8.ops import slice -from openvino.runtime.opset8.ops import softmax -from openvino.runtime.opset4.ops import softplus -from openvino.runtime.opset9.ops import softsign -from openvino.runtime.opset2.ops import space_to_batch -from openvino.runtime.opset1.ops import space_to_depth -from openvino.runtime.opset1.ops import split -from openvino.runtime.opset1.ops import sqrt -from openvino.runtime.opset1.ops import squared_difference -from openvino.runtime.opset1.ops import squeeze -from openvino.runtime.opset1.ops import strided_slice -from openvino.runtime.opset1.ops import subtract -from openvino.runtime.opset4.ops import swish -from openvino.runtime.opset1.ops import tan -from openvino.runtime.opset1.ops import tanh -from openvino.runtime.opset1.ops import tensor_iterator -from openvino.runtime.opset1.ops import tile -from openvino.runtime.opset11.ops import topk -from openvino.runtime.opset1.ops import transpose -from openvino.runtime.opset10.ops import unique -from openvino.runtime.opset1.ops import unsqueeze -from openvino.runtime.opset1.ops import variadic_split +from openvino.opset1.ops import absolute +from openvino.opset1.ops import absolute as abs +from openvino.opset1.ops import acos +from openvino.opset4.ops import acosh +from openvino.opset8.ops import adaptive_avg_pool +from openvino.opset8.ops import adaptive_max_pool +from openvino.opset1.ops import add +from openvino.opset1.ops import asin +from openvino.opset4.ops import asinh +from openvino.opset6.ops import assign +from openvino.opset1.ops import atan +from openvino.opset4.ops import atanh +from openvino.opset1.ops import avg_pool +from openvino.opset5.ops import batch_norm_inference +from openvino.opset2.ops import batch_to_space +from openvino.opset1.ops import binary_convolution +from openvino.opset3.ops import broadcast +from 
openvino.opset3.ops import bucketize +from openvino.opset1.ops import ceiling +from openvino.opset1.ops import ceiling as ceil +from openvino.opset1.ops import clamp +from openvino.opset1.ops import concat +from openvino.opset1.ops import constant +from openvino.opset1.ops import convert +from openvino.opset1.ops import convert_like +from openvino.opset1.ops import convolution +from openvino.opset1.ops import convolution_backprop_data +from openvino.opset1.ops import cos +from openvino.opset1.ops import cosh +from openvino.opset1.ops import ctc_greedy_decoder +from openvino.opset6.ops import ctc_greedy_decoder_seq_len +from openvino.opset4.ops import ctc_loss +from openvino.opset3.ops import cum_sum +from openvino.opset3.ops import cum_sum as cumsum +from openvino.opset8.ops import deformable_convolution +from openvino.opset1.ops import deformable_psroi_pooling +from openvino.opset1.ops import depth_to_space +from openvino.opset8.ops import detection_output +from openvino.opset7.ops import dft +from openvino.opset1.ops import divide +from openvino.opset7.ops import einsum +from openvino.opset1.ops import elu +from openvino.opset3.ops import embedding_bag_offsets_sum +from openvino.opset3.ops import embedding_bag_packed_sum +from openvino.opset3.ops import embedding_segments_sum +from openvino.opset3.ops import extract_image_patches +from openvino.opset1.ops import equal +from openvino.opset1.ops import erf +from openvino.opset1.ops import exp +from openvino.opset9.ops import eye +from openvino.opset1.ops import fake_quantize +from openvino.opset1.ops import floor +from openvino.opset1.ops import floor_mod +from openvino.opset8.ops import gather +from openvino.opset6.ops import gather_elements +from openvino.opset8.ops import gather_nd +from openvino.opset1.ops import gather_tree +from openvino.opset7.ops import gelu +from openvino.opset9.ops import generate_proposals +from openvino.opset1.ops import greater +from openvino.opset1.ops import greater_equal +from 
openvino.opset9.ops import grid_sample +from openvino.opset1.ops import grn +from openvino.opset1.ops import group_convolution +from openvino.opset1.ops import group_convolution_backprop_data +from openvino.opset12.ops import group_normalization +from openvino.opset3.ops import gru_cell +from openvino.opset5.ops import gru_sequence +from openvino.opset1.ops import hard_sigmoid +from openvino.opset5.ops import hsigmoid +from openvino.opset4.ops import hswish +from openvino.opset7.ops import idft +from openvino.opset8.ops import if_op +from openvino.opset11.ops import interpolate +from openvino.opset9.ops import irdft +from openvino.opset10.ops import is_finite +from openvino.opset10.ops import is_inf +from openvino.opset10.ops import is_nan +from openvino.opset8.ops import i420_to_bgr +from openvino.opset8.ops import i420_to_rgb +from openvino.opset1.ops import less +from openvino.opset1.ops import less_equal +from openvino.opset1.ops import log +from openvino.opset1.ops import logical_and +from openvino.opset1.ops import logical_not +from openvino.opset1.ops import logical_or +from openvino.opset1.ops import logical_xor +from openvino.opset5.ops import log_softmax +from openvino.opset5.ops import loop +from openvino.opset1.ops import lrn +from openvino.opset4.ops import lstm_cell +from openvino.opset5.ops import lstm_sequence +from openvino.opset1.ops import matmul +from openvino.opset8.ops import matrix_nms +from openvino.opset8.ops import max_pool +from openvino.opset1.ops import maximum +from openvino.opset1.ops import minimum +from openvino.opset4.ops import mish +from openvino.opset1.ops import mod +from openvino.opset9.ops import multiclass_nms +from openvino.opset1.ops import multiply +from openvino.opset6.ops import mvn +from openvino.opset1.ops import negative +from openvino.opset9.ops import non_max_suppression +from openvino.opset3.ops import non_zero +from openvino.opset1.ops import normalize_l2 +from openvino.opset1.ops import not_equal +from 
openvino.opset8.ops import nv12_to_bgr +from openvino.opset8.ops import nv12_to_rgb +from openvino.opset1.ops import one_hot +from openvino.opset12.ops import pad +from openvino.opset1.ops import parameter +from openvino.opset1.ops import power +from openvino.opset1.ops import prelu +from openvino.opset8.ops import prior_box +from openvino.opset1.ops import prior_box_clustered +from openvino.opset1.ops import psroi_pooling +from openvino.opset4.ops import proposal +from openvino.opset4.ops import range +from openvino.opset8.ops import random_uniform +from openvino.opset9.ops import rdft +from openvino.opset6.ops import read_value +from openvino.opset4.ops import reduce_l1 +from openvino.opset4.ops import reduce_l2 +from openvino.opset1.ops import reduce_logical_and +from openvino.opset1.ops import reduce_logical_or +from openvino.opset1.ops import reduce_max +from openvino.opset1.ops import reduce_mean +from openvino.opset1.ops import reduce_min +from openvino.opset1.ops import reduce_prod +from openvino.opset1.ops import reduce_sum +from openvino.opset1.ops import region_yolo +from openvino.opset2.ops import reorg_yolo +from openvino.opset1.ops import relu +from openvino.opset1.ops import reshape +from openvino.opset1.ops import result +from openvino.opset1.ops import reverse_sequence +from openvino.opset3.ops import rnn_cell +from openvino.opset5.ops import rnn_sequence +from openvino.opset9.ops import roi_align +from openvino.opset2.ops import roi_pooling +from openvino.opset7.ops import roll +from openvino.opset5.ops import round +from openvino.opset12.ops import scatter_elements_update +from openvino.opset4.ops import scatter_nd_update +from openvino.opset3.ops import scatter_update +from openvino.opset1.ops import select +from openvino.opset1.ops import selu +from openvino.opset3.ops import shape_of +from openvino.opset3.ops import shuffle_channels +from openvino.opset1.ops import sigmoid +from openvino.opset1.ops import sign +from openvino.opset1.ops import 
sin +from openvino.opset1.ops import sinh +from openvino.opset8.ops import slice +from openvino.opset8.ops import softmax +from openvino.opset4.ops import softplus +from openvino.opset9.ops import softsign +from openvino.opset2.ops import space_to_batch +from openvino.opset1.ops import space_to_depth +from openvino.opset1.ops import split +from openvino.opset1.ops import sqrt +from openvino.opset1.ops import squared_difference +from openvino.opset1.ops import squeeze +from openvino.opset1.ops import strided_slice +from openvino.opset1.ops import subtract +from openvino.opset4.ops import swish +from openvino.opset1.ops import tan +from openvino.opset1.ops import tanh +from openvino.opset1.ops import tensor_iterator +from openvino.opset1.ops import tile +from openvino.opset11.ops import topk +from openvino.opset1.ops import transpose +from openvino.opset10.ops import unique +from openvino.opset1.ops import unsqueeze +from openvino.opset1.ops import variadic_split diff --git a/src/bindings/python/src/openvino/runtime/opset12/ops/__init__.py b/src/bindings/python/src/openvino/runtime/opset12/ops/__init__.py new file mode 100644 index 00000000000000..b6ba8fe643b381 --- /dev/null +++ b/src/bindings/python/src/openvino/runtime/opset12/ops/__init__.py @@ -0,0 +1,7 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.opset12.ops import group_normalization +from openvino.opset12.ops import pad +from openvino.opset12.ops import scatter_elements_update diff --git a/src/bindings/python/src/openvino/runtime/opset13/__init__.py b/src/bindings/python/src/openvino/runtime/opset13/__init__.py index 7f330fbc87766d..ab3f541e8f831c 100644 --- a/src/bindings/python/src/openvino/runtime/opset13/__init__.py +++ b/src/bindings/python/src/openvino/runtime/opset13/__init__.py @@ -2,187 +2,187 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -from openvino.runtime.opset1.ops import 
absolute -from openvino.runtime.opset1.ops import absolute as abs -from openvino.runtime.opset1.ops import acos -from openvino.runtime.opset4.ops import acosh -from openvino.runtime.opset8.ops import adaptive_avg_pool -from openvino.runtime.opset8.ops import adaptive_max_pool -from openvino.runtime.opset1.ops import add -from openvino.runtime.opset1.ops import asin -from openvino.runtime.opset4.ops import asinh -from openvino.runtime.opset6.ops import assign -from openvino.runtime.opset1.ops import atan -from openvino.runtime.opset4.ops import atanh -from openvino.runtime.opset1.ops import avg_pool -from openvino.runtime.opset5.ops import batch_norm_inference -from openvino.runtime.opset2.ops import batch_to_space -from openvino.runtime.opset1.ops import binary_convolution -from openvino.runtime.opset13.ops import bitwise_and -from openvino.runtime.opset13.ops import bitwise_not -from openvino.runtime.opset13.ops import bitwise_or -from openvino.runtime.opset13.ops import bitwise_xor -from openvino.runtime.opset3.ops import broadcast -from openvino.runtime.opset3.ops import bucketize -from openvino.runtime.opset1.ops import ceiling -from openvino.runtime.opset1.ops import ceiling as ceil -from openvino.runtime.opset1.ops import clamp -from openvino.runtime.opset1.ops import concat -from openvino.runtime.opset13.ops import constant -from openvino.runtime.opset1.ops import convert -from openvino.runtime.opset1.ops import convert_like -from openvino.runtime.opset1.ops import convolution -from openvino.runtime.opset1.ops import convolution_backprop_data -from openvino.runtime.opset1.ops import cos -from openvino.runtime.opset1.ops import cosh -from openvino.runtime.opset1.ops import ctc_greedy_decoder -from openvino.runtime.opset6.ops import ctc_greedy_decoder_seq_len -from openvino.runtime.opset4.ops import ctc_loss -from openvino.runtime.opset3.ops import cum_sum -from openvino.runtime.opset3.ops import cum_sum as cumsum -from openvino.runtime.opset8.ops import 
deformable_convolution -from openvino.runtime.opset1.ops import deformable_psroi_pooling -from openvino.runtime.opset1.ops import depth_to_space -from openvino.runtime.opset8.ops import detection_output -from openvino.runtime.opset7.ops import dft -from openvino.runtime.opset1.ops import divide -from openvino.runtime.opset7.ops import einsum -from openvino.runtime.opset1.ops import elu -from openvino.runtime.opset3.ops import embedding_bag_offsets_sum -from openvino.runtime.opset3.ops import embedding_bag_packed_sum -from openvino.runtime.opset3.ops import embedding_segments_sum -from openvino.runtime.opset3.ops import extract_image_patches -from openvino.runtime.opset1.ops import equal -from openvino.runtime.opset1.ops import erf -from openvino.runtime.opset1.ops import exp -from openvino.runtime.opset9.ops import eye -from openvino.runtime.opset13.ops import fake_convert -from openvino.runtime.opset13.ops import fake_quantize -from openvino.runtime.opset1.ops import floor -from openvino.runtime.opset1.ops import floor_mod -from openvino.runtime.opset8.ops import gather -from openvino.runtime.opset6.ops import gather_elements -from openvino.runtime.opset8.ops import gather_nd -from openvino.runtime.opset1.ops import gather_tree -from openvino.runtime.opset7.ops import gelu -from openvino.runtime.opset9.ops import generate_proposals -from openvino.runtime.opset1.ops import greater -from openvino.runtime.opset1.ops import greater_equal -from openvino.runtime.opset9.ops import grid_sample -from openvino.runtime.opset1.ops import grn -from openvino.runtime.opset1.ops import group_convolution -from openvino.runtime.opset1.ops import group_convolution_backprop_data -from openvino.runtime.opset12.ops import group_normalization -from openvino.runtime.opset3.ops import gru_cell -from openvino.runtime.opset5.ops import gru_sequence -from openvino.runtime.opset1.ops import hard_sigmoid -from openvino.runtime.opset5.ops import hsigmoid -from openvino.runtime.opset4.ops import 
hswish -from openvino.runtime.opset7.ops import idft -from openvino.runtime.opset8.ops import if_op -from openvino.runtime.opset11.ops import interpolate -from openvino.runtime.opset9.ops import irdft -from openvino.runtime.opset10.ops import is_finite -from openvino.runtime.opset10.ops import is_inf -from openvino.runtime.opset10.ops import is_nan -from openvino.runtime.opset8.ops import i420_to_bgr -from openvino.runtime.opset8.ops import i420_to_rgb -from openvino.runtime.opset1.ops import less -from openvino.runtime.opset1.ops import less_equal -from openvino.runtime.opset1.ops import log -from openvino.runtime.opset1.ops import logical_and -from openvino.runtime.opset1.ops import logical_not -from openvino.runtime.opset1.ops import logical_or -from openvino.runtime.opset1.ops import logical_xor -from openvino.runtime.opset5.ops import log_softmax -from openvino.runtime.opset5.ops import loop -from openvino.runtime.opset1.ops import lrn -from openvino.runtime.opset4.ops import lstm_cell -from openvino.runtime.opset5.ops import lstm_sequence -from openvino.runtime.opset1.ops import matmul -from openvino.runtime.opset8.ops import matrix_nms -from openvino.runtime.opset8.ops import max_pool -from openvino.runtime.opset1.ops import maximum -from openvino.runtime.opset1.ops import minimum -from openvino.runtime.opset4.ops import mish -from openvino.runtime.opset1.ops import mod -from openvino.runtime.opset9.ops import multiclass_nms -from openvino.runtime.opset13.ops import multinomial -from openvino.runtime.opset1.ops import multiply -from openvino.runtime.opset6.ops import mvn -from openvino.runtime.opset1.ops import negative -from openvino.runtime.opset13.ops import nms_rotated -from openvino.runtime.opset9.ops import non_max_suppression -from openvino.runtime.opset3.ops import non_zero -from openvino.runtime.opset1.ops import normalize_l2 -from openvino.runtime.opset1.ops import not_equal -from openvino.runtime.opset8.ops import nv12_to_bgr -from 
openvino.runtime.opset8.ops import nv12_to_rgb -from openvino.runtime.opset1.ops import one_hot -from openvino.runtime.opset12.ops import pad -from openvino.runtime.opset1.ops import parameter -from openvino.runtime.opset1.ops import power -from openvino.runtime.opset1.ops import prelu -from openvino.runtime.opset8.ops import prior_box -from openvino.runtime.opset1.ops import prior_box_clustered -from openvino.runtime.opset1.ops import psroi_pooling -from openvino.runtime.opset4.ops import proposal -from openvino.runtime.opset4.ops import range -from openvino.runtime.opset8.ops import random_uniform -from openvino.runtime.opset9.ops import rdft -from openvino.runtime.opset6.ops import read_value -from openvino.runtime.opset4.ops import reduce_l1 -from openvino.runtime.opset4.ops import reduce_l2 -from openvino.runtime.opset1.ops import reduce_logical_and -from openvino.runtime.opset1.ops import reduce_logical_or -from openvino.runtime.opset1.ops import reduce_max -from openvino.runtime.opset1.ops import reduce_mean -from openvino.runtime.opset1.ops import reduce_min -from openvino.runtime.opset1.ops import reduce_prod -from openvino.runtime.opset1.ops import reduce_sum -from openvino.runtime.opset1.ops import region_yolo -from openvino.runtime.opset2.ops import reorg_yolo -from openvino.runtime.opset1.ops import relu -from openvino.runtime.opset1.ops import reshape -from openvino.runtime.opset13.ops import result -from openvino.runtime.opset1.ops import reverse_sequence -from openvino.runtime.opset3.ops import rnn_cell -from openvino.runtime.opset5.ops import rnn_sequence -from openvino.runtime.opset9.ops import roi_align -from openvino.runtime.opset2.ops import roi_pooling -from openvino.runtime.opset7.ops import roll -from openvino.runtime.opset5.ops import round -from openvino.runtime.opset13.ops import scaled_dot_product_attention -from openvino.runtime.opset12.ops import scatter_elements_update -from openvino.runtime.opset4.ops import scatter_nd_update -from 
openvino.runtime.opset3.ops import scatter_update -from openvino.runtime.opset1.ops import select -from openvino.runtime.opset1.ops import selu -from openvino.runtime.opset3.ops import shape_of -from openvino.runtime.opset3.ops import shuffle_channels -from openvino.runtime.opset1.ops import sigmoid -from openvino.runtime.opset1.ops import sign -from openvino.runtime.opset1.ops import sin -from openvino.runtime.opset1.ops import sinh -from openvino.runtime.opset8.ops import slice -from openvino.runtime.opset8.ops import softmax -from openvino.runtime.opset4.ops import softplus -from openvino.runtime.opset9.ops import softsign -from openvino.runtime.opset2.ops import space_to_batch -from openvino.runtime.opset1.ops import space_to_depth -from openvino.runtime.opset1.ops import split -from openvino.runtime.opset1.ops import sqrt -from openvino.runtime.opset1.ops import squared_difference -from openvino.runtime.opset1.ops import squeeze -from openvino.runtime.opset1.ops import strided_slice -from openvino.runtime.opset1.ops import subtract -from openvino.runtime.opset4.ops import swish -from openvino.runtime.opset1.ops import tan -from openvino.runtime.opset1.ops import tanh -from openvino.runtime.opset1.ops import tensor_iterator -from openvino.runtime.opset1.ops import tile -from openvino.runtime.opset11.ops import topk -from openvino.runtime.opset1.ops import transpose -from openvino.runtime.opset10.ops import unique -from openvino.runtime.opset1.ops import unsqueeze -from openvino.runtime.opset1.ops import variadic_split +from openvino.opset1.ops import absolute +from openvino.opset1.ops import absolute as abs +from openvino.opset1.ops import acos +from openvino.opset4.ops import acosh +from openvino.opset8.ops import adaptive_avg_pool +from openvino.opset8.ops import adaptive_max_pool +from openvino.opset1.ops import add +from openvino.opset1.ops import asin +from openvino.opset4.ops import asinh +from openvino.opset6.ops import assign +from openvino.opset1.ops 
import atan +from openvino.opset4.ops import atanh +from openvino.opset1.ops import avg_pool +from openvino.opset5.ops import batch_norm_inference +from openvino.opset2.ops import batch_to_space +from openvino.opset1.ops import binary_convolution +from openvino.opset13.ops import bitwise_and +from openvino.opset13.ops import bitwise_not +from openvino.opset13.ops import bitwise_or +from openvino.opset13.ops import bitwise_xor +from openvino.opset3.ops import broadcast +from openvino.opset3.ops import bucketize +from openvino.opset1.ops import ceiling +from openvino.opset1.ops import ceiling as ceil +from openvino.opset1.ops import clamp +from openvino.opset1.ops import concat +from openvino.opset13.ops import constant +from openvino.opset1.ops import convert +from openvino.opset1.ops import convert_like +from openvino.opset1.ops import convolution +from openvino.opset1.ops import convolution_backprop_data +from openvino.opset1.ops import cos +from openvino.opset1.ops import cosh +from openvino.opset1.ops import ctc_greedy_decoder +from openvino.opset6.ops import ctc_greedy_decoder_seq_len +from openvino.opset4.ops import ctc_loss +from openvino.opset3.ops import cum_sum +from openvino.opset3.ops import cum_sum as cumsum +from openvino.opset8.ops import deformable_convolution +from openvino.opset1.ops import deformable_psroi_pooling +from openvino.opset1.ops import depth_to_space +from openvino.opset8.ops import detection_output +from openvino.opset7.ops import dft +from openvino.opset1.ops import divide +from openvino.opset7.ops import einsum +from openvino.opset1.ops import elu +from openvino.opset3.ops import embedding_bag_offsets_sum +from openvino.opset3.ops import embedding_bag_packed_sum +from openvino.opset3.ops import embedding_segments_sum +from openvino.opset3.ops import extract_image_patches +from openvino.opset1.ops import equal +from openvino.opset1.ops import erf +from openvino.opset1.ops import exp +from openvino.opset9.ops import eye +from 
openvino.opset13.ops import fake_convert +from openvino.opset13.ops import fake_quantize +from openvino.opset1.ops import floor +from openvino.opset1.ops import floor_mod +from openvino.opset8.ops import gather +from openvino.opset6.ops import gather_elements +from openvino.opset8.ops import gather_nd +from openvino.opset1.ops import gather_tree +from openvino.opset7.ops import gelu +from openvino.opset9.ops import generate_proposals +from openvino.opset1.ops import greater +from openvino.opset1.ops import greater_equal +from openvino.opset9.ops import grid_sample +from openvino.opset1.ops import grn +from openvino.opset1.ops import group_convolution +from openvino.opset1.ops import group_convolution_backprop_data +from openvino.opset12.ops import group_normalization +from openvino.opset3.ops import gru_cell +from openvino.opset5.ops import gru_sequence +from openvino.opset1.ops import hard_sigmoid +from openvino.opset5.ops import hsigmoid +from openvino.opset4.ops import hswish +from openvino.opset7.ops import idft +from openvino.opset8.ops import if_op +from openvino.opset11.ops import interpolate +from openvino.opset9.ops import irdft +from openvino.opset10.ops import is_finite +from openvino.opset10.ops import is_inf +from openvino.opset10.ops import is_nan +from openvino.opset8.ops import i420_to_bgr +from openvino.opset8.ops import i420_to_rgb +from openvino.opset1.ops import less +from openvino.opset1.ops import less_equal +from openvino.opset1.ops import log +from openvino.opset1.ops import logical_and +from openvino.opset1.ops import logical_not +from openvino.opset1.ops import logical_or +from openvino.opset1.ops import logical_xor +from openvino.opset5.ops import log_softmax +from openvino.opset5.ops import loop +from openvino.opset1.ops import lrn +from openvino.opset4.ops import lstm_cell +from openvino.opset5.ops import lstm_sequence +from openvino.opset1.ops import matmul +from openvino.opset8.ops import matrix_nms +from openvino.opset8.ops import 
max_pool +from openvino.opset1.ops import maximum +from openvino.opset1.ops import minimum +from openvino.opset4.ops import mish +from openvino.opset1.ops import mod +from openvino.opset9.ops import multiclass_nms +from openvino.opset13.ops import multinomial +from openvino.opset1.ops import multiply +from openvino.opset6.ops import mvn +from openvino.opset1.ops import negative +from openvino.opset13.ops import nms_rotated +from openvino.opset9.ops import non_max_suppression +from openvino.opset3.ops import non_zero +from openvino.opset1.ops import normalize_l2 +from openvino.opset1.ops import not_equal +from openvino.opset8.ops import nv12_to_bgr +from openvino.opset8.ops import nv12_to_rgb +from openvino.opset1.ops import one_hot +from openvino.opset12.ops import pad +from openvino.opset1.ops import parameter +from openvino.opset1.ops import power +from openvino.opset1.ops import prelu +from openvino.opset8.ops import prior_box +from openvino.opset1.ops import prior_box_clustered +from openvino.opset1.ops import psroi_pooling +from openvino.opset4.ops import proposal +from openvino.opset4.ops import range +from openvino.opset8.ops import random_uniform +from openvino.opset9.ops import rdft +from openvino.opset6.ops import read_value +from openvino.opset4.ops import reduce_l1 +from openvino.opset4.ops import reduce_l2 +from openvino.opset1.ops import reduce_logical_and +from openvino.opset1.ops import reduce_logical_or +from openvino.opset1.ops import reduce_max +from openvino.opset1.ops import reduce_mean +from openvino.opset1.ops import reduce_min +from openvino.opset1.ops import reduce_prod +from openvino.opset1.ops import reduce_sum +from openvino.opset1.ops import region_yolo +from openvino.opset2.ops import reorg_yolo +from openvino.opset1.ops import relu +from openvino.opset1.ops import reshape +from openvino.opset13.ops import result +from openvino.opset1.ops import reverse_sequence +from openvino.opset3.ops import rnn_cell +from openvino.opset5.ops import 
rnn_sequence +from openvino.opset9.ops import roi_align +from openvino.opset2.ops import roi_pooling +from openvino.opset7.ops import roll +from openvino.opset5.ops import round +from openvino.opset13.ops import scaled_dot_product_attention +from openvino.opset12.ops import scatter_elements_update +from openvino.opset4.ops import scatter_nd_update +from openvino.opset3.ops import scatter_update +from openvino.opset1.ops import select +from openvino.opset1.ops import selu +from openvino.opset3.ops import shape_of +from openvino.opset3.ops import shuffle_channels +from openvino.opset1.ops import sigmoid +from openvino.opset1.ops import sign +from openvino.opset1.ops import sin +from openvino.opset1.ops import sinh +from openvino.opset8.ops import slice +from openvino.opset8.ops import softmax +from openvino.opset4.ops import softplus +from openvino.opset9.ops import softsign +from openvino.opset2.ops import space_to_batch +from openvino.opset1.ops import space_to_depth +from openvino.opset1.ops import split +from openvino.opset1.ops import sqrt +from openvino.opset1.ops import squared_difference +from openvino.opset1.ops import squeeze +from openvino.opset1.ops import strided_slice +from openvino.opset1.ops import subtract +from openvino.opset4.ops import swish +from openvino.opset1.ops import tan +from openvino.opset1.ops import tanh +from openvino.opset1.ops import tensor_iterator +from openvino.opset1.ops import tile +from openvino.opset11.ops import topk +from openvino.opset1.ops import transpose +from openvino.opset10.ops import unique +from openvino.opset1.ops import unsqueeze +from openvino.opset1.ops import variadic_split diff --git a/src/bindings/python/src/openvino/runtime/opset13/ops/__init__.py b/src/bindings/python/src/openvino/runtime/opset13/ops/__init__.py new file mode 100644 index 00000000000000..edc66e873e4779 --- /dev/null +++ b/src/bindings/python/src/openvino/runtime/opset13/ops/__init__.py @@ -0,0 +1,15 @@ +# -*- coding: utf-8 -*- +# Copyright 
(C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.opset13.ops import bitwise_and +from openvino.opset13.ops import bitwise_not +from openvino.opset13.ops import bitwise_or +from openvino.opset13.ops import bitwise_xor +from openvino.opset13.ops import constant +from openvino.opset13.ops import fake_convert +from openvino.opset13.ops import fake_quantize +from openvino.opset13.ops import multinomial +from openvino.opset13.ops import nms_rotated +from openvino.opset13.ops import result +from openvino.opset13.ops import scaled_dot_product_attention diff --git a/src/bindings/python/src/openvino/runtime/opset14/__init__.py b/src/bindings/python/src/openvino/runtime/opset14/__init__.py index 52ac785bd723e5..8a503a333bd3e1 100644 --- a/src/bindings/python/src/openvino/runtime/opset14/__init__.py +++ b/src/bindings/python/src/openvino/runtime/opset14/__init__.py @@ -2,189 +2,189 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -from openvino.runtime.opset1.ops import absolute -from openvino.runtime.opset1.ops import absolute as abs -from openvino.runtime.opset1.ops import acos -from openvino.runtime.opset4.ops import acosh -from openvino.runtime.opset8.ops import adaptive_avg_pool -from openvino.runtime.opset8.ops import adaptive_max_pool -from openvino.runtime.opset1.ops import add -from openvino.runtime.opset1.ops import asin -from openvino.runtime.opset4.ops import asinh -from openvino.runtime.opset6.ops import assign -from openvino.runtime.opset1.ops import atan -from openvino.runtime.opset4.ops import atanh -from openvino.runtime.opset14.ops import avg_pool -from openvino.runtime.opset5.ops import batch_norm_inference -from openvino.runtime.opset2.ops import batch_to_space -from openvino.runtime.opset1.ops import binary_convolution -from openvino.runtime.opset13.ops import bitwise_and -from openvino.runtime.opset13.ops import bitwise_not -from openvino.runtime.opset13.ops import bitwise_or -from 
openvino.runtime.opset13.ops import bitwise_xor -from openvino.runtime.opset3.ops import broadcast -from openvino.runtime.opset3.ops import bucketize -from openvino.runtime.opset1.ops import ceiling -from openvino.runtime.opset1.ops import ceiling as ceil -from openvino.runtime.opset1.ops import clamp -from openvino.runtime.opset1.ops import concat -from openvino.runtime.opset13.ops import constant -from openvino.runtime.opset1.ops import convert -from openvino.runtime.opset1.ops import convert_like -from openvino.runtime.opset14.ops import convert_promote_types -from openvino.runtime.opset1.ops import convolution -from openvino.runtime.opset1.ops import convolution_backprop_data -from openvino.runtime.opset1.ops import cos -from openvino.runtime.opset1.ops import cosh -from openvino.runtime.opset1.ops import ctc_greedy_decoder -from openvino.runtime.opset6.ops import ctc_greedy_decoder_seq_len -from openvino.runtime.opset4.ops import ctc_loss -from openvino.runtime.opset3.ops import cum_sum -from openvino.runtime.opset3.ops import cum_sum as cumsum -from openvino.runtime.opset8.ops import deformable_convolution -from openvino.runtime.opset1.ops import deformable_psroi_pooling -from openvino.runtime.opset1.ops import depth_to_space -from openvino.runtime.opset8.ops import detection_output -from openvino.runtime.opset7.ops import dft -from openvino.runtime.opset1.ops import divide -from openvino.runtime.opset7.ops import einsum -from openvino.runtime.opset1.ops import elu -from openvino.runtime.opset3.ops import embedding_bag_offsets_sum -from openvino.runtime.opset3.ops import embedding_bag_packed_sum -from openvino.runtime.opset3.ops import embedding_segments_sum -from openvino.runtime.opset3.ops import extract_image_patches -from openvino.runtime.opset1.ops import equal -from openvino.runtime.opset1.ops import erf -from openvino.runtime.opset1.ops import exp -from openvino.runtime.opset9.ops import eye -from openvino.runtime.opset13.ops import fake_convert -from 
openvino.runtime.opset13.ops import fake_quantize -from openvino.runtime.opset1.ops import floor -from openvino.runtime.opset1.ops import floor_mod -from openvino.runtime.opset8.ops import gather -from openvino.runtime.opset6.ops import gather_elements -from openvino.runtime.opset8.ops import gather_nd -from openvino.runtime.opset1.ops import gather_tree -from openvino.runtime.opset7.ops import gelu -from openvino.runtime.opset9.ops import generate_proposals -from openvino.runtime.opset1.ops import greater -from openvino.runtime.opset1.ops import greater_equal -from openvino.runtime.opset9.ops import grid_sample -from openvino.runtime.opset1.ops import grn -from openvino.runtime.opset1.ops import group_convolution -from openvino.runtime.opset1.ops import group_convolution_backprop_data -from openvino.runtime.opset12.ops import group_normalization -from openvino.runtime.opset3.ops import gru_cell -from openvino.runtime.opset5.ops import gru_sequence -from openvino.runtime.opset1.ops import hard_sigmoid -from openvino.runtime.opset5.ops import hsigmoid -from openvino.runtime.opset4.ops import hswish -from openvino.runtime.opset7.ops import idft -from openvino.runtime.opset8.ops import if_op -from openvino.runtime.opset11.ops import interpolate -from openvino.runtime.opset14.ops import inverse -from openvino.runtime.opset9.ops import irdft -from openvino.runtime.opset10.ops import is_finite -from openvino.runtime.opset10.ops import is_inf -from openvino.runtime.opset10.ops import is_nan -from openvino.runtime.opset8.ops import i420_to_bgr -from openvino.runtime.opset8.ops import i420_to_rgb -from openvino.runtime.opset1.ops import less -from openvino.runtime.opset1.ops import less_equal -from openvino.runtime.opset1.ops import log -from openvino.runtime.opset1.ops import logical_and -from openvino.runtime.opset1.ops import logical_not -from openvino.runtime.opset1.ops import logical_or -from openvino.runtime.opset1.ops import logical_xor -from 
openvino.runtime.opset5.ops import log_softmax -from openvino.runtime.opset5.ops import loop -from openvino.runtime.opset1.ops import lrn -from openvino.runtime.opset4.ops import lstm_cell -from openvino.runtime.opset5.ops import lstm_sequence -from openvino.runtime.opset1.ops import matmul -from openvino.runtime.opset8.ops import matrix_nms -from openvino.runtime.opset14.ops import max_pool -from openvino.runtime.opset1.ops import maximum -from openvino.runtime.opset1.ops import minimum -from openvino.runtime.opset4.ops import mish -from openvino.runtime.opset1.ops import mod -from openvino.runtime.opset9.ops import multiclass_nms -from openvino.runtime.opset13.ops import multinomial -from openvino.runtime.opset1.ops import multiply -from openvino.runtime.opset6.ops import mvn -from openvino.runtime.opset1.ops import negative -from openvino.runtime.opset13.ops import nms_rotated -from openvino.runtime.opset9.ops import non_max_suppression -from openvino.runtime.opset3.ops import non_zero -from openvino.runtime.opset1.ops import normalize_l2 -from openvino.runtime.opset1.ops import not_equal -from openvino.runtime.opset8.ops import nv12_to_bgr -from openvino.runtime.opset8.ops import nv12_to_rgb -from openvino.runtime.opset1.ops import one_hot -from openvino.runtime.opset12.ops import pad -from openvino.runtime.opset1.ops import parameter -from openvino.runtime.opset1.ops import power -from openvino.runtime.opset1.ops import prelu -from openvino.runtime.opset8.ops import prior_box -from openvino.runtime.opset1.ops import prior_box_clustered -from openvino.runtime.opset1.ops import psroi_pooling -from openvino.runtime.opset4.ops import proposal -from openvino.runtime.opset4.ops import range -from openvino.runtime.opset8.ops import random_uniform -from openvino.runtime.opset9.ops import rdft -from openvino.runtime.opset6.ops import read_value -from openvino.runtime.opset4.ops import reduce_l1 -from openvino.runtime.opset4.ops import reduce_l2 -from 
openvino.runtime.opset1.ops import reduce_logical_and -from openvino.runtime.opset1.ops import reduce_logical_or -from openvino.runtime.opset1.ops import reduce_max -from openvino.runtime.opset1.ops import reduce_mean -from openvino.runtime.opset1.ops import reduce_min -from openvino.runtime.opset1.ops import reduce_prod -from openvino.runtime.opset1.ops import reduce_sum -from openvino.runtime.opset1.ops import region_yolo -from openvino.runtime.opset2.ops import reorg_yolo -from openvino.runtime.opset1.ops import relu -from openvino.runtime.opset1.ops import reshape -from openvino.runtime.opset13.ops import result -from openvino.runtime.opset1.ops import reverse_sequence -from openvino.runtime.opset3.ops import rnn_cell -from openvino.runtime.opset5.ops import rnn_sequence -from openvino.runtime.opset9.ops import roi_align -from openvino.runtime.opset2.ops import roi_pooling -from openvino.runtime.opset7.ops import roll -from openvino.runtime.opset5.ops import round -from openvino.runtime.opset13.ops import scaled_dot_product_attention -from openvino.runtime.opset12.ops import scatter_elements_update -from openvino.runtime.opset4.ops import scatter_nd_update -from openvino.runtime.opset3.ops import scatter_update -from openvino.runtime.opset1.ops import select -from openvino.runtime.opset1.ops import selu -from openvino.runtime.opset3.ops import shape_of -from openvino.runtime.opset3.ops import shuffle_channels -from openvino.runtime.opset1.ops import sigmoid -from openvino.runtime.opset1.ops import sign -from openvino.runtime.opset1.ops import sin -from openvino.runtime.opset1.ops import sinh -from openvino.runtime.opset8.ops import slice -from openvino.runtime.opset8.ops import softmax -from openvino.runtime.opset4.ops import softplus -from openvino.runtime.opset9.ops import softsign -from openvino.runtime.opset2.ops import space_to_batch -from openvino.runtime.opset1.ops import space_to_depth -from openvino.runtime.opset1.ops import split -from 
openvino.runtime.opset1.ops import sqrt -from openvino.runtime.opset1.ops import squared_difference -from openvino.runtime.opset1.ops import squeeze -from openvino.runtime.opset1.ops import strided_slice -from openvino.runtime.opset1.ops import subtract -from openvino.runtime.opset4.ops import swish -from openvino.runtime.opset1.ops import tan -from openvino.runtime.opset1.ops import tanh -from openvino.runtime.opset1.ops import tensor_iterator -from openvino.runtime.opset1.ops import tile -from openvino.runtime.opset11.ops import topk -from openvino.runtime.opset1.ops import transpose -from openvino.runtime.opset10.ops import unique -from openvino.runtime.opset1.ops import unsqueeze -from openvino.runtime.opset1.ops import variadic_split +from openvino.opset1.ops import absolute +from openvino.opset1.ops import absolute as abs +from openvino.opset1.ops import acos +from openvino.opset4.ops import acosh +from openvino.opset8.ops import adaptive_avg_pool +from openvino.opset8.ops import adaptive_max_pool +from openvino.opset1.ops import add +from openvino.opset1.ops import asin +from openvino.opset4.ops import asinh +from openvino.opset6.ops import assign +from openvino.opset1.ops import atan +from openvino.opset4.ops import atanh +from openvino.opset14.ops import avg_pool +from openvino.opset5.ops import batch_norm_inference +from openvino.opset2.ops import batch_to_space +from openvino.opset1.ops import binary_convolution +from openvino.opset13.ops import bitwise_and +from openvino.opset13.ops import bitwise_not +from openvino.opset13.ops import bitwise_or +from openvino.opset13.ops import bitwise_xor +from openvino.opset3.ops import broadcast +from openvino.opset3.ops import bucketize +from openvino.opset1.ops import ceiling +from openvino.opset1.ops import ceiling as ceil +from openvino.opset1.ops import clamp +from openvino.opset1.ops import concat +from openvino.opset13.ops import constant +from openvino.opset1.ops import convert +from openvino.opset1.ops 
import convert_like +from openvino.opset14.ops import convert_promote_types +from openvino.opset1.ops import convolution +from openvino.opset1.ops import convolution_backprop_data +from openvino.opset1.ops import cos +from openvino.opset1.ops import cosh +from openvino.opset1.ops import ctc_greedy_decoder +from openvino.opset6.ops import ctc_greedy_decoder_seq_len +from openvino.opset4.ops import ctc_loss +from openvino.opset3.ops import cum_sum +from openvino.opset3.ops import cum_sum as cumsum +from openvino.opset8.ops import deformable_convolution +from openvino.opset1.ops import deformable_psroi_pooling +from openvino.opset1.ops import depth_to_space +from openvino.opset8.ops import detection_output +from openvino.opset7.ops import dft +from openvino.opset1.ops import divide +from openvino.opset7.ops import einsum +from openvino.opset1.ops import elu +from openvino.opset3.ops import embedding_bag_offsets_sum +from openvino.opset3.ops import embedding_bag_packed_sum +from openvino.opset3.ops import embedding_segments_sum +from openvino.opset3.ops import extract_image_patches +from openvino.opset1.ops import equal +from openvino.opset1.ops import erf +from openvino.opset1.ops import exp +from openvino.opset9.ops import eye +from openvino.opset13.ops import fake_convert +from openvino.opset13.ops import fake_quantize +from openvino.opset1.ops import floor +from openvino.opset1.ops import floor_mod +from openvino.opset8.ops import gather +from openvino.opset6.ops import gather_elements +from openvino.opset8.ops import gather_nd +from openvino.opset1.ops import gather_tree +from openvino.opset7.ops import gelu +from openvino.opset9.ops import generate_proposals +from openvino.opset1.ops import greater +from openvino.opset1.ops import greater_equal +from openvino.opset9.ops import grid_sample +from openvino.opset1.ops import grn +from openvino.opset1.ops import group_convolution +from openvino.opset1.ops import group_convolution_backprop_data +from 
openvino.opset12.ops import group_normalization +from openvino.opset3.ops import gru_cell +from openvino.opset5.ops import gru_sequence +from openvino.opset1.ops import hard_sigmoid +from openvino.opset5.ops import hsigmoid +from openvino.opset4.ops import hswish +from openvino.opset7.ops import idft +from openvino.opset8.ops import if_op +from openvino.opset11.ops import interpolate +from openvino.opset14.ops import inverse +from openvino.opset9.ops import irdft +from openvino.opset10.ops import is_finite +from openvino.opset10.ops import is_inf +from openvino.opset10.ops import is_nan +from openvino.opset8.ops import i420_to_bgr +from openvino.opset8.ops import i420_to_rgb +from openvino.opset1.ops import less +from openvino.opset1.ops import less_equal +from openvino.opset1.ops import log +from openvino.opset1.ops import logical_and +from openvino.opset1.ops import logical_not +from openvino.opset1.ops import logical_or +from openvino.opset1.ops import logical_xor +from openvino.opset5.ops import log_softmax +from openvino.opset5.ops import loop +from openvino.opset1.ops import lrn +from openvino.opset4.ops import lstm_cell +from openvino.opset5.ops import lstm_sequence +from openvino.opset1.ops import matmul +from openvino.opset8.ops import matrix_nms +from openvino.opset14.ops import max_pool +from openvino.opset1.ops import maximum +from openvino.opset1.ops import minimum +from openvino.opset4.ops import mish +from openvino.opset1.ops import mod +from openvino.opset9.ops import multiclass_nms +from openvino.opset13.ops import multinomial +from openvino.opset1.ops import multiply +from openvino.opset6.ops import mvn +from openvino.opset1.ops import negative +from openvino.opset13.ops import nms_rotated +from openvino.opset9.ops import non_max_suppression +from openvino.opset3.ops import non_zero +from openvino.opset1.ops import normalize_l2 +from openvino.opset1.ops import not_equal +from openvino.opset8.ops import nv12_to_bgr +from openvino.opset8.ops import 
nv12_to_rgb +from openvino.opset1.ops import one_hot +from openvino.opset12.ops import pad +from openvino.opset1.ops import parameter +from openvino.opset1.ops import power +from openvino.opset1.ops import prelu +from openvino.opset8.ops import prior_box +from openvino.opset1.ops import prior_box_clustered +from openvino.opset1.ops import psroi_pooling +from openvino.opset4.ops import proposal +from openvino.opset4.ops import range +from openvino.opset8.ops import random_uniform +from openvino.opset9.ops import rdft +from openvino.opset6.ops import read_value +from openvino.opset4.ops import reduce_l1 +from openvino.opset4.ops import reduce_l2 +from openvino.opset1.ops import reduce_logical_and +from openvino.opset1.ops import reduce_logical_or +from openvino.opset1.ops import reduce_max +from openvino.opset1.ops import reduce_mean +from openvino.opset1.ops import reduce_min +from openvino.opset1.ops import reduce_prod +from openvino.opset1.ops import reduce_sum +from openvino.opset1.ops import region_yolo +from openvino.opset2.ops import reorg_yolo +from openvino.opset1.ops import relu +from openvino.opset1.ops import reshape +from openvino.opset13.ops import result +from openvino.opset1.ops import reverse_sequence +from openvino.opset3.ops import rnn_cell +from openvino.opset5.ops import rnn_sequence +from openvino.opset9.ops import roi_align +from openvino.opset2.ops import roi_pooling +from openvino.opset7.ops import roll +from openvino.opset5.ops import round +from openvino.opset13.ops import scaled_dot_product_attention +from openvino.opset12.ops import scatter_elements_update +from openvino.opset4.ops import scatter_nd_update +from openvino.opset3.ops import scatter_update +from openvino.opset1.ops import select +from openvino.opset1.ops import selu +from openvino.opset3.ops import shape_of +from openvino.opset3.ops import shuffle_channels +from openvino.opset1.ops import sigmoid +from openvino.opset1.ops import sign +from openvino.opset1.ops import sin 
+from openvino.opset1.ops import sinh +from openvino.opset8.ops import slice +from openvino.opset8.ops import softmax +from openvino.opset4.ops import softplus +from openvino.opset9.ops import softsign +from openvino.opset2.ops import space_to_batch +from openvino.opset1.ops import space_to_depth +from openvino.opset1.ops import split +from openvino.opset1.ops import sqrt +from openvino.opset1.ops import squared_difference +from openvino.opset1.ops import squeeze +from openvino.opset1.ops import strided_slice +from openvino.opset1.ops import subtract +from openvino.opset4.ops import swish +from openvino.opset1.ops import tan +from openvino.opset1.ops import tanh +from openvino.opset1.ops import tensor_iterator +from openvino.opset1.ops import tile +from openvino.opset11.ops import topk +from openvino.opset1.ops import transpose +from openvino.opset10.ops import unique +from openvino.opset1.ops import unsqueeze +from openvino.opset1.ops import variadic_split diff --git a/src/bindings/python/src/openvino/runtime/opset14/ops/__init__.py b/src/bindings/python/src/openvino/runtime/opset14/ops/__init__.py new file mode 100644 index 00000000000000..8f5d09b5d0733e --- /dev/null +++ b/src/bindings/python/src/openvino/runtime/opset14/ops/__init__.py @@ -0,0 +1,8 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.opset14.ops import avg_pool +from openvino.opset14.ops import convert_promote_types +from openvino.opset14.ops import inverse +from openvino.opset14.ops import max_pool diff --git a/src/bindings/python/src/openvino/runtime/opset15/__init__.py b/src/bindings/python/src/openvino/runtime/opset15/__init__.py index c4dd48d9087ae1..d5b93924a69e6e 100644 --- a/src/bindings/python/src/openvino/runtime/opset15/__init__.py +++ b/src/bindings/python/src/openvino/runtime/opset15/__init__.py @@ -3,202 +3,202 @@ # SPDX-License-Identifier: Apache-2.0 # New operations added in Opset15 -from 
openvino.runtime.opset15.ops import col2im -from openvino.runtime.opset15.ops import embedding_bag_offsets -from openvino.runtime.opset15.ops import embedding_bag_packed -from openvino.runtime.opset15.ops import scatter_nd_update -from openvino.runtime.opset15.ops import roi_align_rotated -from openvino.runtime.opset15.ops import string_tensor_pack -from openvino.runtime.opset15.ops import string_tensor_unpack -from openvino.runtime.opset15.ops import bitwise_left_shift -from openvino.runtime.opset15.ops import bitwise_right_shift -from openvino.runtime.opset15.ops import slice_scatter +from openvino.opset15.ops import col2im +from openvino.opset15.ops import embedding_bag_offsets +from openvino.opset15.ops import embedding_bag_packed +from openvino.opset15.ops import scatter_nd_update +from openvino.opset15.ops import roi_align_rotated +from openvino.opset15.ops import string_tensor_pack +from openvino.opset15.ops import string_tensor_unpack +from openvino.opset15.ops import bitwise_left_shift +from openvino.opset15.ops import bitwise_right_shift +from openvino.opset15.ops import slice_scatter # Operators from previous opsets -from openvino.runtime.opset1.ops import absolute -from openvino.runtime.opset1.ops import absolute as abs -from openvino.runtime.opset1.ops import acos -from openvino.runtime.opset4.ops import acosh -from openvino.runtime.opset8.ops import adaptive_avg_pool -from openvino.runtime.opset8.ops import adaptive_max_pool -from openvino.runtime.opset1.ops import add -from openvino.runtime.opset1.ops import asin -from openvino.runtime.opset4.ops import asinh -from openvino.runtime.opset6.ops import assign -from openvino.runtime.opset1.ops import atan -from openvino.runtime.opset4.ops import atanh -from openvino.runtime.opset14.ops import avg_pool -from openvino.runtime.opset5.ops import batch_norm_inference -from openvino.runtime.opset2.ops import batch_to_space -from openvino.runtime.opset1.ops import binary_convolution -from 
openvino.runtime.opset13.ops import bitwise_and -from openvino.runtime.opset13.ops import bitwise_not -from openvino.runtime.opset13.ops import bitwise_or -from openvino.runtime.opset13.ops import bitwise_xor -from openvino.runtime.opset3.ops import broadcast -from openvino.runtime.opset3.ops import bucketize -from openvino.runtime.opset1.ops import ceiling -from openvino.runtime.opset1.ops import ceiling as ceil -from openvino.runtime.opset1.ops import clamp -from openvino.runtime.opset1.ops import concat -from openvino.runtime.opset13.ops import constant -from openvino.runtime.opset1.ops import convert -from openvino.runtime.opset1.ops import convert_like -from openvino.runtime.opset14.ops import convert_promote_types -from openvino.runtime.opset1.ops import convolution -from openvino.runtime.opset1.ops import convolution_backprop_data -from openvino.runtime.opset1.ops import cos -from openvino.runtime.opset1.ops import cosh -from openvino.runtime.opset1.ops import ctc_greedy_decoder -from openvino.runtime.opset6.ops import ctc_greedy_decoder_seq_len -from openvino.runtime.opset4.ops import ctc_loss -from openvino.runtime.opset3.ops import cum_sum -from openvino.runtime.opset3.ops import cum_sum as cumsum -from openvino.runtime.opset8.ops import deformable_convolution -from openvino.runtime.opset1.ops import deformable_psroi_pooling -from openvino.runtime.opset1.ops import depth_to_space -from openvino.runtime.opset8.ops import detection_output -from openvino.runtime.opset7.ops import dft -from openvino.runtime.opset1.ops import divide -from openvino.runtime.opset7.ops import einsum -from openvino.runtime.opset1.ops import elu -from openvino.runtime.opset3.ops import embedding_bag_offsets_sum -from openvino.runtime.opset3.ops import embedding_bag_packed_sum -from openvino.runtime.opset3.ops import embedding_segments_sum -from openvino.runtime.opset3.ops import extract_image_patches -from openvino.runtime.opset1.ops import equal -from openvino.runtime.opset1.ops 
import erf -from openvino.runtime.opset1.ops import exp -from openvino.runtime.opset9.ops import eye -from openvino.runtime.opset13.ops import fake_convert -from openvino.runtime.opset13.ops import fake_quantize -from openvino.runtime.opset1.ops import floor -from openvino.runtime.opset1.ops import floor_mod -from openvino.runtime.opset8.ops import gather -from openvino.runtime.opset6.ops import gather_elements -from openvino.runtime.opset8.ops import gather_nd -from openvino.runtime.opset1.ops import gather_tree -from openvino.runtime.opset7.ops import gelu -from openvino.runtime.opset9.ops import generate_proposals -from openvino.runtime.opset1.ops import greater -from openvino.runtime.opset1.ops import greater_equal -from openvino.runtime.opset9.ops import grid_sample -from openvino.runtime.opset1.ops import grn -from openvino.runtime.opset1.ops import group_convolution -from openvino.runtime.opset1.ops import group_convolution_backprop_data -from openvino.runtime.opset12.ops import group_normalization -from openvino.runtime.opset3.ops import gru_cell -from openvino.runtime.opset5.ops import gru_sequence -from openvino.runtime.opset1.ops import hard_sigmoid -from openvino.runtime.opset5.ops import hsigmoid -from openvino.runtime.opset4.ops import hswish -from openvino.runtime.opset7.ops import idft -from openvino.runtime.opset8.ops import if_op -from openvino.runtime.opset11.ops import interpolate -from openvino.runtime.opset14.ops import inverse -from openvino.runtime.opset9.ops import irdft -from openvino.runtime.opset10.ops import is_finite -from openvino.runtime.opset10.ops import is_inf -from openvino.runtime.opset10.ops import is_nan -from openvino.runtime.opset8.ops import i420_to_bgr -from openvino.runtime.opset8.ops import i420_to_rgb -from openvino.runtime.opset1.ops import less -from openvino.runtime.opset1.ops import less_equal -from openvino.runtime.opset1.ops import log -from openvino.runtime.opset1.ops import logical_and -from 
openvino.runtime.opset1.ops import logical_not -from openvino.runtime.opset1.ops import logical_or -from openvino.runtime.opset1.ops import logical_xor -from openvino.runtime.opset5.ops import log_softmax -from openvino.runtime.opset5.ops import loop -from openvino.runtime.opset1.ops import lrn -from openvino.runtime.opset4.ops import lstm_cell -from openvino.runtime.opset5.ops import lstm_sequence -from openvino.runtime.opset1.ops import matmul -from openvino.runtime.opset8.ops import matrix_nms -from openvino.runtime.opset14.ops import max_pool -from openvino.runtime.opset1.ops import maximum -from openvino.runtime.opset1.ops import minimum -from openvino.runtime.opset4.ops import mish -from openvino.runtime.opset1.ops import mod -from openvino.runtime.opset9.ops import multiclass_nms -from openvino.runtime.opset13.ops import multinomial -from openvino.runtime.opset1.ops import multiply -from openvino.runtime.opset6.ops import mvn -from openvino.runtime.opset1.ops import negative -from openvino.runtime.opset13.ops import nms_rotated -from openvino.runtime.opset9.ops import non_max_suppression -from openvino.runtime.opset3.ops import non_zero -from openvino.runtime.opset1.ops import normalize_l2 -from openvino.runtime.opset1.ops import not_equal -from openvino.runtime.opset8.ops import nv12_to_bgr -from openvino.runtime.opset8.ops import nv12_to_rgb -from openvino.runtime.opset1.ops import one_hot -from openvino.runtime.opset12.ops import pad -from openvino.runtime.opset1.ops import parameter -from openvino.runtime.opset1.ops import power -from openvino.runtime.opset1.ops import prelu -from openvino.runtime.opset8.ops import prior_box -from openvino.runtime.opset1.ops import prior_box_clustered -from openvino.runtime.opset1.ops import psroi_pooling -from openvino.runtime.opset4.ops import proposal -from openvino.runtime.opset4.ops import range -from openvino.runtime.opset8.ops import random_uniform -from openvino.runtime.opset9.ops import rdft -from 
openvino.runtime.opset6.ops import read_value -from openvino.runtime.opset4.ops import reduce_l1 -from openvino.runtime.opset4.ops import reduce_l2 -from openvino.runtime.opset1.ops import reduce_logical_and -from openvino.runtime.opset1.ops import reduce_logical_or -from openvino.runtime.opset1.ops import reduce_max -from openvino.runtime.opset1.ops import reduce_mean -from openvino.runtime.opset1.ops import reduce_min -from openvino.runtime.opset1.ops import reduce_prod -from openvino.runtime.opset1.ops import reduce_sum -from openvino.runtime.opset1.ops import region_yolo -from openvino.runtime.opset2.ops import reorg_yolo -from openvino.runtime.opset1.ops import relu -from openvino.runtime.opset1.ops import reshape -from openvino.runtime.opset13.ops import result -from openvino.runtime.opset1.ops import reverse_sequence -from openvino.runtime.opset3.ops import rnn_cell -from openvino.runtime.opset5.ops import rnn_sequence -from openvino.runtime.opset9.ops import roi_align -from openvino.runtime.opset2.ops import roi_pooling -from openvino.runtime.opset7.ops import roll -from openvino.runtime.opset5.ops import round -from openvino.runtime.opset13.ops import scaled_dot_product_attention -from openvino.runtime.opset12.ops import scatter_elements_update -from openvino.runtime.opset3.ops import scatter_update -from openvino.runtime.opset15.ops import search_sorted -from openvino.runtime.opset1.ops import select -from openvino.runtime.opset1.ops import selu -from openvino.runtime.opset3.ops import shape_of -from openvino.runtime.opset3.ops import shuffle_channels -from openvino.runtime.opset1.ops import sigmoid -from openvino.runtime.opset1.ops import sign -from openvino.runtime.opset1.ops import sin -from openvino.runtime.opset1.ops import sinh -from openvino.runtime.opset8.ops import slice -from openvino.runtime.opset8.ops import softmax -from openvino.runtime.opset4.ops import softplus -from openvino.runtime.opset9.ops import softsign -from 
openvino.runtime.opset2.ops import space_to_batch -from openvino.runtime.opset1.ops import space_to_depth -from openvino.runtime.opset1.ops import split -from openvino.runtime.opset1.ops import sqrt -from openvino.runtime.opset1.ops import squared_difference -from openvino.runtime.opset15.ops import squeeze -from openvino.runtime.opset15.ops import stft -from openvino.runtime.opset1.ops import strided_slice -from openvino.runtime.opset1.ops import subtract -from openvino.runtime.opset4.ops import swish -from openvino.runtime.opset1.ops import tan -from openvino.runtime.opset1.ops import tanh -from openvino.runtime.opset1.ops import tensor_iterator -from openvino.runtime.opset1.ops import tile -from openvino.runtime.opset11.ops import topk -from openvino.runtime.opset1.ops import transpose -from openvino.runtime.opset10.ops import unique -from openvino.runtime.opset1.ops import unsqueeze -from openvino.runtime.opset1.ops import variadic_split +from openvino.opset1.ops import absolute +from openvino.opset1.ops import absolute as abs +from openvino.opset1.ops import acos +from openvino.opset4.ops import acosh +from openvino.opset8.ops import adaptive_avg_pool +from openvino.opset8.ops import adaptive_max_pool +from openvino.opset1.ops import add +from openvino.opset1.ops import asin +from openvino.opset4.ops import asinh +from openvino.opset6.ops import assign +from openvino.opset1.ops import atan +from openvino.opset4.ops import atanh +from openvino.opset14.ops import avg_pool +from openvino.opset5.ops import batch_norm_inference +from openvino.opset2.ops import batch_to_space +from openvino.opset1.ops import binary_convolution +from openvino.opset13.ops import bitwise_and +from openvino.opset13.ops import bitwise_not +from openvino.opset13.ops import bitwise_or +from openvino.opset13.ops import bitwise_xor +from openvino.opset3.ops import broadcast +from openvino.opset3.ops import bucketize +from openvino.opset1.ops import ceiling +from openvino.opset1.ops import 
ceiling as ceil +from openvino.opset1.ops import clamp +from openvino.opset1.ops import concat +from openvino.opset13.ops import constant +from openvino.opset1.ops import convert +from openvino.opset1.ops import convert_like +from openvino.opset14.ops import convert_promote_types +from openvino.opset1.ops import convolution +from openvino.opset1.ops import convolution_backprop_data +from openvino.opset1.ops import cos +from openvino.opset1.ops import cosh +from openvino.opset1.ops import ctc_greedy_decoder +from openvino.opset6.ops import ctc_greedy_decoder_seq_len +from openvino.opset4.ops import ctc_loss +from openvino.opset3.ops import cum_sum +from openvino.opset3.ops import cum_sum as cumsum +from openvino.opset8.ops import deformable_convolution +from openvino.opset1.ops import deformable_psroi_pooling +from openvino.opset1.ops import depth_to_space +from openvino.opset8.ops import detection_output +from openvino.opset7.ops import dft +from openvino.opset1.ops import divide +from openvino.opset7.ops import einsum +from openvino.opset1.ops import elu +from openvino.opset3.ops import embedding_bag_offsets_sum +from openvino.opset3.ops import embedding_bag_packed_sum +from openvino.opset3.ops import embedding_segments_sum +from openvino.opset3.ops import extract_image_patches +from openvino.opset1.ops import equal +from openvino.opset1.ops import erf +from openvino.opset1.ops import exp +from openvino.opset9.ops import eye +from openvino.opset13.ops import fake_convert +from openvino.opset13.ops import fake_quantize +from openvino.opset1.ops import floor +from openvino.opset1.ops import floor_mod +from openvino.opset8.ops import gather +from openvino.opset6.ops import gather_elements +from openvino.opset8.ops import gather_nd +from openvino.opset1.ops import gather_tree +from openvino.opset7.ops import gelu +from openvino.opset9.ops import generate_proposals +from openvino.opset1.ops import greater +from openvino.opset1.ops import greater_equal +from 
openvino.opset9.ops import grid_sample +from openvino.opset1.ops import grn +from openvino.opset1.ops import group_convolution +from openvino.opset1.ops import group_convolution_backprop_data +from openvino.opset12.ops import group_normalization +from openvino.opset3.ops import gru_cell +from openvino.opset5.ops import gru_sequence +from openvino.opset1.ops import hard_sigmoid +from openvino.opset5.ops import hsigmoid +from openvino.opset4.ops import hswish +from openvino.opset7.ops import idft +from openvino.opset8.ops import if_op +from openvino.opset11.ops import interpolate +from openvino.opset14.ops import inverse +from openvino.opset9.ops import irdft +from openvino.opset10.ops import is_finite +from openvino.opset10.ops import is_inf +from openvino.opset10.ops import is_nan +from openvino.opset8.ops import i420_to_bgr +from openvino.opset8.ops import i420_to_rgb +from openvino.opset1.ops import less +from openvino.opset1.ops import less_equal +from openvino.opset1.ops import log +from openvino.opset1.ops import logical_and +from openvino.opset1.ops import logical_not +from openvino.opset1.ops import logical_or +from openvino.opset1.ops import logical_xor +from openvino.opset5.ops import log_softmax +from openvino.opset5.ops import loop +from openvino.opset1.ops import lrn +from openvino.opset4.ops import lstm_cell +from openvino.opset5.ops import lstm_sequence +from openvino.opset1.ops import matmul +from openvino.opset8.ops import matrix_nms +from openvino.opset14.ops import max_pool +from openvino.opset1.ops import maximum +from openvino.opset1.ops import minimum +from openvino.opset4.ops import mish +from openvino.opset1.ops import mod +from openvino.opset9.ops import multiclass_nms +from openvino.opset13.ops import multinomial +from openvino.opset1.ops import multiply +from openvino.opset6.ops import mvn +from openvino.opset1.ops import negative +from openvino.opset13.ops import nms_rotated +from openvino.opset9.ops import non_max_suppression +from 
openvino.opset3.ops import non_zero +from openvino.opset1.ops import normalize_l2 +from openvino.opset1.ops import not_equal +from openvino.opset8.ops import nv12_to_bgr +from openvino.opset8.ops import nv12_to_rgb +from openvino.opset1.ops import one_hot +from openvino.opset12.ops import pad +from openvino.opset1.ops import parameter +from openvino.opset1.ops import power +from openvino.opset1.ops import prelu +from openvino.opset8.ops import prior_box +from openvino.opset1.ops import prior_box_clustered +from openvino.opset1.ops import psroi_pooling +from openvino.opset4.ops import proposal +from openvino.opset4.ops import range +from openvino.opset8.ops import random_uniform +from openvino.opset9.ops import rdft +from openvino.opset6.ops import read_value +from openvino.opset4.ops import reduce_l1 +from openvino.opset4.ops import reduce_l2 +from openvino.opset1.ops import reduce_logical_and +from openvino.opset1.ops import reduce_logical_or +from openvino.opset1.ops import reduce_max +from openvino.opset1.ops import reduce_mean +from openvino.opset1.ops import reduce_min +from openvino.opset1.ops import reduce_prod +from openvino.opset1.ops import reduce_sum +from openvino.opset1.ops import region_yolo +from openvino.opset2.ops import reorg_yolo +from openvino.opset1.ops import relu +from openvino.opset1.ops import reshape +from openvino.opset13.ops import result +from openvino.opset1.ops import reverse_sequence +from openvino.opset3.ops import rnn_cell +from openvino.opset5.ops import rnn_sequence +from openvino.opset9.ops import roi_align +from openvino.opset2.ops import roi_pooling +from openvino.opset7.ops import roll +from openvino.opset5.ops import round +from openvino.opset13.ops import scaled_dot_product_attention +from openvino.opset12.ops import scatter_elements_update +from openvino.opset3.ops import scatter_update +from openvino.opset15.ops import search_sorted +from openvino.opset1.ops import select +from openvino.opset1.ops import selu +from 
openvino.opset3.ops import shape_of +from openvino.opset3.ops import shuffle_channels +from openvino.opset1.ops import sigmoid +from openvino.opset1.ops import sign +from openvino.opset1.ops import sin +from openvino.opset1.ops import sinh +from openvino.opset8.ops import slice +from openvino.opset8.ops import softmax +from openvino.opset4.ops import softplus +from openvino.opset9.ops import softsign +from openvino.opset2.ops import space_to_batch +from openvino.opset1.ops import space_to_depth +from openvino.opset1.ops import split +from openvino.opset1.ops import sqrt +from openvino.opset1.ops import squared_difference +from openvino.opset15.ops import squeeze +from openvino.opset15.ops import stft +from openvino.opset1.ops import strided_slice +from openvino.opset1.ops import subtract +from openvino.opset4.ops import swish +from openvino.opset1.ops import tan +from openvino.opset1.ops import tanh +from openvino.opset1.ops import tensor_iterator +from openvino.opset1.ops import tile +from openvino.opset11.ops import topk +from openvino.opset1.ops import transpose +from openvino.opset10.ops import unique +from openvino.opset1.ops import unsqueeze +from openvino.opset1.ops import variadic_split diff --git a/src/bindings/python/src/openvino/runtime/opset15/ops/__init__.py b/src/bindings/python/src/openvino/runtime/opset15/ops/__init__.py new file mode 100644 index 00000000000000..57543286d31543 --- /dev/null +++ b/src/bindings/python/src/openvino/runtime/opset15/ops/__init__.py @@ -0,0 +1,17 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.opset15.ops import col2im +from openvino.opset15.ops import embedding_bag_offsets +from openvino.opset15.ops import embedding_bag_packed +from openvino.opset15.ops import scatter_nd_update +from openvino.opset15.ops import roi_align_rotated +from openvino.opset15.ops import string_tensor_pack +from openvino.opset15.ops import string_tensor_unpack +from 
openvino.opset15.ops import bitwise_left_shift +from openvino.opset15.ops import bitwise_right_shift +from openvino.opset15.ops import slice_scatter +from openvino.opset15.ops import search_sorted +from openvino.opset15.ops import squeeze +from openvino.opset15.ops import stft diff --git a/src/bindings/python/src/openvino/runtime/opset16/__init__.py b/src/bindings/python/src/openvino/runtime/opset16/__init__.py index ce52690e919fc3..06fa9d2e7d0070 100644 --- a/src/bindings/python/src/openvino/runtime/opset16/__init__.py +++ b/src/bindings/python/src/openvino/runtime/opset16/__init__.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 # New operations added in Opset16 -from openvino.runtime.opset16.ops import identity +from openvino.opset16.ops import identity # Operators from previous opsets # TODO (ticket: 156877): Add previous opset operators at the end of opset16 development diff --git a/src/bindings/python/src/openvino/runtime/opset16/ops/__init__.py b/src/bindings/python/src/openvino/runtime/opset16/ops/__init__.py new file mode 100644 index 00000000000000..d28ce61bd00d54 --- /dev/null +++ b/src/bindings/python/src/openvino/runtime/opset16/ops/__init__.py @@ -0,0 +1,5 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.opset16.ops import identity diff --git a/src/bindings/python/src/openvino/runtime/opset2/__init__.py b/src/bindings/python/src/openvino/runtime/opset2/__init__.py index 6624149e157e9e..1306c89b5241d8 100644 --- a/src/bindings/python/src/openvino/runtime/opset2/__init__.py +++ b/src/bindings/python/src/openvino/runtime/opset2/__init__.py @@ -2,116 +2,116 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -from openvino.runtime.opset1.ops import absolute -from openvino.runtime.opset1.ops import absolute as abs -from openvino.runtime.opset1.ops import acos -from openvino.runtime.opset1.ops import add -from openvino.runtime.opset1.ops 
import asin -from openvino.runtime.opset1.ops import atan -from openvino.runtime.opset1.ops import avg_pool -from openvino.runtime.opset1.ops import batch_norm_inference -from openvino.runtime.opset2.ops import batch_to_space -from openvino.runtime.opset1.ops import binary_convolution -from openvino.runtime.opset1.ops import broadcast -from openvino.runtime.opset1.ops import ceiling -from openvino.runtime.opset1.ops import ceiling as ceil -from openvino.runtime.opset1.ops import clamp -from openvino.runtime.opset1.ops import concat -from openvino.runtime.opset1.ops import constant -from openvino.runtime.opset1.ops import convert -from openvino.runtime.opset1.ops import convert_like -from openvino.runtime.opset1.ops import convolution -from openvino.runtime.opset1.ops import convolution_backprop_data -from openvino.runtime.opset1.ops import cos -from openvino.runtime.opset1.ops import cosh -from openvino.runtime.opset1.ops import ctc_greedy_decoder -from openvino.runtime.opset1.ops import deformable_convolution -from openvino.runtime.opset1.ops import deformable_psroi_pooling -from openvino.runtime.opset1.ops import depth_to_space -from openvino.runtime.opset1.ops import detection_output -from openvino.runtime.opset1.ops import divide -from openvino.runtime.opset1.ops import elu -from openvino.runtime.opset1.ops import equal -from openvino.runtime.opset1.ops import erf -from openvino.runtime.opset1.ops import exp -from openvino.runtime.opset1.ops import fake_quantize -from openvino.runtime.opset1.ops import floor -from openvino.runtime.opset1.ops import floor_mod -from openvino.runtime.opset1.ops import gather -from openvino.runtime.opset1.ops import gather_tree -from openvino.runtime.opset2.ops import gelu -from openvino.runtime.opset1.ops import greater -from openvino.runtime.opset1.ops import greater_equal -from openvino.runtime.opset1.ops import grn -from openvino.runtime.opset1.ops import group_convolution -from openvino.runtime.opset1.ops import 
group_convolution_backprop_data -from openvino.runtime.opset1.ops import hard_sigmoid -from openvino.runtime.opset1.ops import interpolate -from openvino.runtime.opset1.ops import less -from openvino.runtime.opset1.ops import less_equal -from openvino.runtime.opset1.ops import log -from openvino.runtime.opset1.ops import logical_and -from openvino.runtime.opset1.ops import logical_not -from openvino.runtime.opset1.ops import logical_or -from openvino.runtime.opset1.ops import logical_xor -from openvino.runtime.opset1.ops import lrn -from openvino.runtime.opset1.ops import lstm_cell -from openvino.runtime.opset1.ops import matmul -from openvino.runtime.opset1.ops import max_pool -from openvino.runtime.opset1.ops import maximum -from openvino.runtime.opset1.ops import minimum -from openvino.runtime.opset1.ops import mod -from openvino.runtime.opset1.ops import multiply -from openvino.runtime.opset2.ops import mvn -from openvino.runtime.opset1.ops import negative -from openvino.runtime.opset1.ops import non_max_suppression -from openvino.runtime.opset1.ops import normalize_l2 -from openvino.runtime.opset1.ops import not_equal -from openvino.runtime.opset1.ops import one_hot -from openvino.runtime.opset1.ops import pad -from openvino.runtime.opset1.ops import parameter -from openvino.runtime.opset1.ops import power -from openvino.runtime.opset1.ops import prelu -from openvino.runtime.opset1.ops import prior_box -from openvino.runtime.opset1.ops import prior_box_clustered -from openvino.runtime.opset1.ops import psroi_pooling -from openvino.runtime.opset1.ops import proposal -from openvino.runtime.opset1.ops import range -from openvino.runtime.opset1.ops import reduce_logical_and -from openvino.runtime.opset1.ops import reduce_logical_or -from openvino.runtime.opset1.ops import reduce_max -from openvino.runtime.opset1.ops import reduce_mean -from openvino.runtime.opset1.ops import reduce_min -from openvino.runtime.opset1.ops import reduce_prod -from 
openvino.runtime.opset1.ops import reduce_sum -from openvino.runtime.opset1.ops import region_yolo -from openvino.runtime.opset2.ops import reorg_yolo -from openvino.runtime.opset1.ops import relu -from openvino.runtime.opset1.ops import reshape -from openvino.runtime.opset1.ops import result -from openvino.runtime.opset1.ops import reverse_sequence -from openvino.runtime.opset2.ops import roi_pooling -from openvino.runtime.opset1.ops import select -from openvino.runtime.opset1.ops import selu -from openvino.runtime.opset1.ops import shape_of -from openvino.runtime.opset1.ops import sigmoid -from openvino.runtime.opset1.ops import sign -from openvino.runtime.opset1.ops import sin -from openvino.runtime.opset1.ops import sinh -from openvino.runtime.opset1.ops import softmax -from openvino.runtime.opset2.ops import space_to_batch -from openvino.runtime.opset1.ops import space_to_depth -from openvino.runtime.opset1.ops import split -from openvino.runtime.opset1.ops import sqrt -from openvino.runtime.opset1.ops import squared_difference -from openvino.runtime.opset1.ops import squeeze -from openvino.runtime.opset1.ops import strided_slice -from openvino.runtime.opset1.ops import subtract -from openvino.runtime.opset1.ops import tan -from openvino.runtime.opset1.ops import tanh -from openvino.runtime.opset1.ops import tensor_iterator -from openvino.runtime.opset1.ops import tile -from openvino.runtime.opset1.ops import topk -from openvino.runtime.opset1.ops import transpose -from openvino.runtime.opset1.ops import unsqueeze -from openvino.runtime.opset1.ops import variadic_split +from openvino.opset1.ops import absolute +from openvino.opset1.ops import absolute as abs +from openvino.opset1.ops import acos +from openvino.opset1.ops import add +from openvino.opset1.ops import asin +from openvino.opset1.ops import atan +from openvino.opset1.ops import avg_pool +from openvino.opset1.ops import batch_norm_inference +from openvino.opset2.ops import batch_to_space +from 
openvino.opset1.ops import binary_convolution +from openvino.opset1.ops import broadcast +from openvino.opset1.ops import ceiling +from openvino.opset1.ops import ceiling as ceil +from openvino.opset1.ops import clamp +from openvino.opset1.ops import concat +from openvino.opset1.ops import constant +from openvino.opset1.ops import convert +from openvino.opset1.ops import convert_like +from openvino.opset1.ops import convolution +from openvino.opset1.ops import convolution_backprop_data +from openvino.opset1.ops import cos +from openvino.opset1.ops import cosh +from openvino.opset1.ops import ctc_greedy_decoder +from openvino.opset1.ops import deformable_convolution +from openvino.opset1.ops import deformable_psroi_pooling +from openvino.opset1.ops import depth_to_space +from openvino.opset1.ops import detection_output +from openvino.opset1.ops import divide +from openvino.opset1.ops import elu +from openvino.opset1.ops import equal +from openvino.opset1.ops import erf +from openvino.opset1.ops import exp +from openvino.opset1.ops import fake_quantize +from openvino.opset1.ops import floor +from openvino.opset1.ops import floor_mod +from openvino.opset1.ops import gather +from openvino.opset1.ops import gather_tree +from openvino.opset2.ops import gelu +from openvino.opset1.ops import greater +from openvino.opset1.ops import greater_equal +from openvino.opset1.ops import grn +from openvino.opset1.ops import group_convolution +from openvino.opset1.ops import group_convolution_backprop_data +from openvino.opset1.ops import hard_sigmoid +from openvino.opset1.ops import interpolate +from openvino.opset1.ops import less +from openvino.opset1.ops import less_equal +from openvino.opset1.ops import log +from openvino.opset1.ops import logical_and +from openvino.opset1.ops import logical_not +from openvino.opset1.ops import logical_or +from openvino.opset1.ops import logical_xor +from openvino.opset1.ops import lrn +from openvino.opset1.ops import lstm_cell +from 
openvino.opset1.ops import matmul +from openvino.opset1.ops import max_pool +from openvino.opset1.ops import maximum +from openvino.opset1.ops import minimum +from openvino.opset1.ops import mod +from openvino.opset1.ops import multiply +from openvino.opset2.ops import mvn +from openvino.opset1.ops import negative +from openvino.opset1.ops import non_max_suppression +from openvino.opset1.ops import normalize_l2 +from openvino.opset1.ops import not_equal +from openvino.opset1.ops import one_hot +from openvino.opset1.ops import pad +from openvino.opset1.ops import parameter +from openvino.opset1.ops import power +from openvino.opset1.ops import prelu +from openvino.opset1.ops import prior_box +from openvino.opset1.ops import prior_box_clustered +from openvino.opset1.ops import psroi_pooling +from openvino.opset1.ops import proposal +from openvino.opset1.ops import range +from openvino.opset1.ops import reduce_logical_and +from openvino.opset1.ops import reduce_logical_or +from openvino.opset1.ops import reduce_max +from openvino.opset1.ops import reduce_mean +from openvino.opset1.ops import reduce_min +from openvino.opset1.ops import reduce_prod +from openvino.opset1.ops import reduce_sum +from openvino.opset1.ops import region_yolo +from openvino.opset2.ops import reorg_yolo +from openvino.opset1.ops import relu +from openvino.opset1.ops import reshape +from openvino.opset1.ops import result +from openvino.opset1.ops import reverse_sequence +from openvino.opset2.ops import roi_pooling +from openvino.opset1.ops import select +from openvino.opset1.ops import selu +from openvino.opset1.ops import shape_of +from openvino.opset1.ops import sigmoid +from openvino.opset1.ops import sign +from openvino.opset1.ops import sin +from openvino.opset1.ops import sinh +from openvino.opset1.ops import softmax +from openvino.opset2.ops import space_to_batch +from openvino.opset1.ops import space_to_depth +from openvino.opset1.ops import split +from openvino.opset1.ops import sqrt 
+from openvino.opset1.ops import squared_difference +from openvino.opset1.ops import squeeze +from openvino.opset1.ops import strided_slice +from openvino.opset1.ops import subtract +from openvino.opset1.ops import tan +from openvino.opset1.ops import tanh +from openvino.opset1.ops import tensor_iterator +from openvino.opset1.ops import tile +from openvino.opset1.ops import topk +from openvino.opset1.ops import transpose +from openvino.opset1.ops import unsqueeze +from openvino.opset1.ops import variadic_split diff --git a/src/bindings/python/src/openvino/runtime/opset2/ops/__init__.py b/src/bindings/python/src/openvino/runtime/opset2/ops/__init__.py new file mode 100644 index 00000000000000..89d30e8521e6d8 --- /dev/null +++ b/src/bindings/python/src/openvino/runtime/opset2/ops/__init__.py @@ -0,0 +1,10 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.opset2.ops import batch_to_space +from openvino.opset2.ops import gelu +from openvino.opset2.ops import mvn +from openvino.opset2.ops import reorg_yolo +from openvino.opset2.ops import roi_pooling +from openvino.opset2.ops import space_to_batch diff --git a/src/bindings/python/src/openvino/runtime/opset3/__init__.py b/src/bindings/python/src/openvino/runtime/opset3/__init__.py index 5bd68912cae807..3a0baa675114f1 100644 --- a/src/bindings/python/src/openvino/runtime/opset3/__init__.py +++ b/src/bindings/python/src/openvino/runtime/opset3/__init__.py @@ -2,132 +2,132 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -from openvino.runtime.opset1.ops import absolute -from openvino.runtime.opset1.ops import absolute as abs -from openvino.runtime.opset1.ops import acos -from openvino.runtime.opset1.ops import add -from openvino.runtime.opset1.ops import asin -from openvino.runtime.opset3.ops import assign -from openvino.runtime.opset1.ops import atan -from openvino.runtime.opset1.ops import avg_pool -from 
openvino.runtime.opset1.ops import batch_norm_inference -from openvino.runtime.opset2.ops import batch_to_space -from openvino.runtime.opset1.ops import binary_convolution -from openvino.runtime.opset3.ops import broadcast -from openvino.runtime.opset3.ops import bucketize -from openvino.runtime.opset1.ops import ceiling -from openvino.runtime.opset1.ops import ceiling as ceil -from openvino.runtime.opset1.ops import clamp -from openvino.runtime.opset1.ops import concat -from openvino.runtime.opset1.ops import constant -from openvino.runtime.opset1.ops import convert -from openvino.runtime.opset1.ops import convert_like -from openvino.runtime.opset1.ops import convolution -from openvino.runtime.opset1.ops import convolution_backprop_data -from openvino.runtime.opset1.ops import cos -from openvino.runtime.opset1.ops import cosh -from openvino.runtime.opset1.ops import ctc_greedy_decoder -from openvino.runtime.opset3.ops import cum_sum -from openvino.runtime.opset3.ops import cum_sum as cumsum -from openvino.runtime.opset1.ops import deformable_convolution -from openvino.runtime.opset1.ops import deformable_psroi_pooling -from openvino.runtime.opset1.ops import depth_to_space -from openvino.runtime.opset1.ops import detection_output -from openvino.runtime.opset1.ops import divide -from openvino.runtime.opset1.ops import elu -from openvino.runtime.opset3.ops import embedding_bag_offsets_sum -from openvino.runtime.opset3.ops import embedding_bag_packed_sum -from openvino.runtime.opset3.ops import embedding_segments_sum -from openvino.runtime.opset3.ops import extract_image_patches -from openvino.runtime.opset1.ops import equal -from openvino.runtime.opset1.ops import erf -from openvino.runtime.opset1.ops import exp -from openvino.runtime.opset1.ops import fake_quantize -from openvino.runtime.opset1.ops import floor -from openvino.runtime.opset1.ops import floor_mod -from openvino.runtime.opset1.ops import gather -from openvino.runtime.opset1.ops import gather_tree 
-from openvino.runtime.opset2.ops import gelu -from openvino.runtime.opset1.ops import greater -from openvino.runtime.opset1.ops import greater_equal -from openvino.runtime.opset1.ops import grn -from openvino.runtime.opset1.ops import group_convolution -from openvino.runtime.opset1.ops import group_convolution_backprop_data -from openvino.runtime.opset3.ops import gru_cell -from openvino.runtime.opset1.ops import hard_sigmoid -from openvino.runtime.opset1.ops import interpolate -from openvino.runtime.opset1.ops import less -from openvino.runtime.opset1.ops import less_equal -from openvino.runtime.opset1.ops import log -from openvino.runtime.opset1.ops import logical_and -from openvino.runtime.opset1.ops import logical_not -from openvino.runtime.opset1.ops import logical_or -from openvino.runtime.opset1.ops import logical_xor -from openvino.runtime.opset1.ops import lrn -from openvino.runtime.opset1.ops import lstm_cell -from openvino.runtime.opset1.ops import matmul -from openvino.runtime.opset1.ops import max_pool -from openvino.runtime.opset1.ops import maximum -from openvino.runtime.opset1.ops import minimum -from openvino.runtime.opset1.ops import mod -from openvino.runtime.opset1.ops import multiply -from openvino.runtime.opset2.ops import mvn -from openvino.runtime.opset1.ops import negative -from openvino.runtime.opset3.ops import non_max_suppression -from openvino.runtime.opset3.ops import non_zero -from openvino.runtime.opset1.ops import normalize_l2 -from openvino.runtime.opset1.ops import not_equal -from openvino.runtime.opset1.ops import one_hot -from openvino.runtime.opset1.ops import pad -from openvino.runtime.opset1.ops import parameter -from openvino.runtime.opset1.ops import power -from openvino.runtime.opset1.ops import prelu -from openvino.runtime.opset1.ops import prior_box -from openvino.runtime.opset1.ops import prior_box_clustered -from openvino.runtime.opset1.ops import psroi_pooling -from openvino.runtime.opset1.ops import proposal -from 
openvino.runtime.opset1.ops import range -from openvino.runtime.opset3.ops import read_value -from openvino.runtime.opset1.ops import reduce_logical_and -from openvino.runtime.opset1.ops import reduce_logical_or -from openvino.runtime.opset1.ops import reduce_max -from openvino.runtime.opset1.ops import reduce_mean -from openvino.runtime.opset1.ops import reduce_min -from openvino.runtime.opset1.ops import reduce_prod -from openvino.runtime.opset1.ops import reduce_sum -from openvino.runtime.opset1.ops import region_yolo -from openvino.runtime.opset2.ops import reorg_yolo -from openvino.runtime.opset1.ops import relu -from openvino.runtime.opset1.ops import reshape -from openvino.runtime.opset1.ops import result -from openvino.runtime.opset1.ops import reverse_sequence -from openvino.runtime.opset3.ops import rnn_cell -from openvino.runtime.opset3.ops import roi_align -from openvino.runtime.opset2.ops import roi_pooling -from openvino.runtime.opset3.ops import scatter_elements_update -from openvino.runtime.opset3.ops import scatter_update -from openvino.runtime.opset1.ops import select -from openvino.runtime.opset1.ops import selu -from openvino.runtime.opset3.ops import shape_of -from openvino.runtime.opset3.ops import shuffle_channels -from openvino.runtime.opset1.ops import sigmoid -from openvino.runtime.opset1.ops import sign -from openvino.runtime.opset1.ops import sin -from openvino.runtime.opset1.ops import sinh -from openvino.runtime.opset1.ops import softmax -from openvino.runtime.opset2.ops import space_to_batch -from openvino.runtime.opset1.ops import space_to_depth -from openvino.runtime.opset1.ops import split -from openvino.runtime.opset1.ops import sqrt -from openvino.runtime.opset1.ops import squared_difference -from openvino.runtime.opset1.ops import squeeze -from openvino.runtime.opset1.ops import strided_slice -from openvino.runtime.opset1.ops import subtract -from openvino.runtime.opset1.ops import tan -from openvino.runtime.opset1.ops import 
tanh -from openvino.runtime.opset1.ops import tensor_iterator -from openvino.runtime.opset1.ops import tile -from openvino.runtime.opset3.ops import topk -from openvino.runtime.opset1.ops import transpose -from openvino.runtime.opset1.ops import unsqueeze -from openvino.runtime.opset1.ops import variadic_split +from openvino.opset1.ops import absolute +from openvino.opset1.ops import absolute as abs +from openvino.opset1.ops import acos +from openvino.opset1.ops import add +from openvino.opset1.ops import asin +from openvino.opset3.ops import assign +from openvino.opset1.ops import atan +from openvino.opset1.ops import avg_pool +from openvino.opset1.ops import batch_norm_inference +from openvino.opset2.ops import batch_to_space +from openvino.opset1.ops import binary_convolution +from openvino.opset3.ops import broadcast +from openvino.opset3.ops import bucketize +from openvino.opset1.ops import ceiling +from openvino.opset1.ops import ceiling as ceil +from openvino.opset1.ops import clamp +from openvino.opset1.ops import concat +from openvino.opset1.ops import constant +from openvino.opset1.ops import convert +from openvino.opset1.ops import convert_like +from openvino.opset1.ops import convolution +from openvino.opset1.ops import convolution_backprop_data +from openvino.opset1.ops import cos +from openvino.opset1.ops import cosh +from openvino.opset1.ops import ctc_greedy_decoder +from openvino.opset3.ops import cum_sum +from openvino.opset3.ops import cum_sum as cumsum +from openvino.opset1.ops import deformable_convolution +from openvino.opset1.ops import deformable_psroi_pooling +from openvino.opset1.ops import depth_to_space +from openvino.opset1.ops import detection_output +from openvino.opset1.ops import divide +from openvino.opset1.ops import elu +from openvino.opset3.ops import embedding_bag_offsets_sum +from openvino.opset3.ops import embedding_bag_packed_sum +from openvino.opset3.ops import embedding_segments_sum +from openvino.opset3.ops import 
extract_image_patches +from openvino.opset1.ops import equal +from openvino.opset1.ops import erf +from openvino.opset1.ops import exp +from openvino.opset1.ops import fake_quantize +from openvino.opset1.ops import floor +from openvino.opset1.ops import floor_mod +from openvino.opset1.ops import gather +from openvino.opset1.ops import gather_tree +from openvino.opset2.ops import gelu +from openvino.opset1.ops import greater +from openvino.opset1.ops import greater_equal +from openvino.opset1.ops import grn +from openvino.opset1.ops import group_convolution +from openvino.opset1.ops import group_convolution_backprop_data +from openvino.opset3.ops import gru_cell +from openvino.opset1.ops import hard_sigmoid +from openvino.opset1.ops import interpolate +from openvino.opset1.ops import less +from openvino.opset1.ops import less_equal +from openvino.opset1.ops import log +from openvino.opset1.ops import logical_and +from openvino.opset1.ops import logical_not +from openvino.opset1.ops import logical_or +from openvino.opset1.ops import logical_xor +from openvino.opset1.ops import lrn +from openvino.opset1.ops import lstm_cell +from openvino.opset1.ops import matmul +from openvino.opset1.ops import max_pool +from openvino.opset1.ops import maximum +from openvino.opset1.ops import minimum +from openvino.opset1.ops import mod +from openvino.opset1.ops import multiply +from openvino.opset2.ops import mvn +from openvino.opset1.ops import negative +from openvino.opset3.ops import non_max_suppression +from openvino.opset3.ops import non_zero +from openvino.opset1.ops import normalize_l2 +from openvino.opset1.ops import not_equal +from openvino.opset1.ops import one_hot +from openvino.opset1.ops import pad +from openvino.opset1.ops import parameter +from openvino.opset1.ops import power +from openvino.opset1.ops import prelu +from openvino.opset1.ops import prior_box +from openvino.opset1.ops import prior_box_clustered +from openvino.opset1.ops import psroi_pooling +from 
openvino.opset1.ops import proposal +from openvino.opset1.ops import range +from openvino.opset3.ops import read_value +from openvino.opset1.ops import reduce_logical_and +from openvino.opset1.ops import reduce_logical_or +from openvino.opset1.ops import reduce_max +from openvino.opset1.ops import reduce_mean +from openvino.opset1.ops import reduce_min +from openvino.opset1.ops import reduce_prod +from openvino.opset1.ops import reduce_sum +from openvino.opset1.ops import region_yolo +from openvino.opset2.ops import reorg_yolo +from openvino.opset1.ops import relu +from openvino.opset1.ops import reshape +from openvino.opset1.ops import result +from openvino.opset1.ops import reverse_sequence +from openvino.opset3.ops import rnn_cell +from openvino.opset3.ops import roi_align +from openvino.opset2.ops import roi_pooling +from openvino.opset3.ops import scatter_elements_update +from openvino.opset3.ops import scatter_update +from openvino.opset1.ops import select +from openvino.opset1.ops import selu +from openvino.opset3.ops import shape_of +from openvino.opset3.ops import shuffle_channels +from openvino.opset1.ops import sigmoid +from openvino.opset1.ops import sign +from openvino.opset1.ops import sin +from openvino.opset1.ops import sinh +from openvino.opset1.ops import softmax +from openvino.opset2.ops import space_to_batch +from openvino.opset1.ops import space_to_depth +from openvino.opset1.ops import split +from openvino.opset1.ops import sqrt +from openvino.opset1.ops import squared_difference +from openvino.opset1.ops import squeeze +from openvino.opset1.ops import strided_slice +from openvino.opset1.ops import subtract +from openvino.opset1.ops import tan +from openvino.opset1.ops import tanh +from openvino.opset1.ops import tensor_iterator +from openvino.opset1.ops import tile +from openvino.opset3.ops import topk +from openvino.opset1.ops import transpose +from openvino.opset1.ops import unsqueeze +from openvino.opset1.ops import variadic_split diff 
--git a/src/bindings/python/src/openvino/runtime/opset3/ops/__init__.py b/src/bindings/python/src/openvino/runtime/opset3/ops/__init__.py new file mode 100644 index 00000000000000..b8af66f9b514c8 --- /dev/null +++ b/src/bindings/python/src/openvino/runtime/opset3/ops/__init__.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.opset3.ops import assign +from openvino.opset3.ops import broadcast +from openvino.opset3.ops import bucketize +from openvino.opset3.ops import cum_sum +from openvino.opset3.ops import embedding_bag_offsets_sum +from openvino.opset3.ops import embedding_bag_packed_sum +from openvino.opset3.ops import embedding_segments_sum +from openvino.opset3.ops import extract_image_patches +from openvino.opset3.ops import gru_cell +from openvino.opset3.ops import non_max_suppression +from openvino.opset3.ops import non_zero +from openvino.opset3.ops import read_value +from openvino.opset3.ops import rnn_cell +from openvino.opset3.ops import roi_align +from openvino.opset3.ops import scatter_elements_update +from openvino.opset3.ops import scatter_update +from openvino.opset3.ops import shape_of +from openvino.opset3.ops import shuffle_channels +from openvino.opset3.ops import topk diff --git a/src/bindings/python/src/openvino/runtime/opset4/__init__.py b/src/bindings/python/src/openvino/runtime/opset4/__init__.py index e7cef8eb216d17..6096ec431c796e 100644 --- a/src/bindings/python/src/openvino/runtime/opset4/__init__.py +++ b/src/bindings/python/src/openvino/runtime/opset4/__init__.py @@ -2,143 +2,143 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -from openvino.runtime.opset1.ops import absolute -from openvino.runtime.opset1.ops import absolute as abs -from openvino.runtime.opset1.ops import acos -from openvino.runtime.opset4.ops import acosh -from openvino.runtime.opset1.ops import add -from openvino.runtime.opset1.ops 
import asin -from openvino.runtime.opset4.ops import asinh -from openvino.runtime.opset3.ops import assign -from openvino.runtime.opset1.ops import atan -from openvino.runtime.opset4.ops import atanh -from openvino.runtime.opset1.ops import avg_pool -from openvino.runtime.opset1.ops import batch_norm_inference -from openvino.runtime.opset2.ops import batch_to_space -from openvino.runtime.opset1.ops import binary_convolution -from openvino.runtime.opset3.ops import broadcast -from openvino.runtime.opset3.ops import bucketize -from openvino.runtime.opset1.ops import ceiling -from openvino.runtime.opset1.ops import ceiling as ceil -from openvino.runtime.opset1.ops import clamp -from openvino.runtime.opset1.ops import concat -from openvino.runtime.opset1.ops import constant -from openvino.runtime.opset1.ops import convert -from openvino.runtime.opset1.ops import convert_like -from openvino.runtime.opset1.ops import convolution -from openvino.runtime.opset1.ops import convolution_backprop_data -from openvino.runtime.opset1.ops import cos -from openvino.runtime.opset1.ops import cosh -from openvino.runtime.opset1.ops import ctc_greedy_decoder -from openvino.runtime.opset4.ops import ctc_loss -from openvino.runtime.opset3.ops import cum_sum -from openvino.runtime.opset3.ops import cum_sum as cumsum -from openvino.runtime.opset1.ops import deformable_convolution -from openvino.runtime.opset1.ops import deformable_psroi_pooling -from openvino.runtime.opset1.ops import depth_to_space -from openvino.runtime.opset1.ops import detection_output -from openvino.runtime.opset1.ops import divide -from openvino.runtime.opset1.ops import elu -from openvino.runtime.opset3.ops import embedding_bag_offsets_sum -from openvino.runtime.opset3.ops import embedding_bag_packed_sum -from openvino.runtime.opset3.ops import embedding_segments_sum -from openvino.runtime.opset3.ops import extract_image_patches -from openvino.runtime.opset1.ops import equal -from openvino.runtime.opset1.ops import 
erf -from openvino.runtime.opset1.ops import exp -from openvino.runtime.opset1.ops import fake_quantize -from openvino.runtime.opset1.ops import floor -from openvino.runtime.opset1.ops import floor_mod -from openvino.runtime.opset1.ops import gather -from openvino.runtime.opset1.ops import gather_tree -from openvino.runtime.opset2.ops import gelu -from openvino.runtime.opset1.ops import greater -from openvino.runtime.opset1.ops import greater_equal -from openvino.runtime.opset1.ops import grn -from openvino.runtime.opset1.ops import group_convolution -from openvino.runtime.opset1.ops import group_convolution_backprop_data -from openvino.runtime.opset3.ops import gru_cell -from openvino.runtime.opset1.ops import hard_sigmoid -from openvino.runtime.opset4.ops import hswish -from openvino.runtime.opset1.ops import interpolate -from openvino.runtime.opset1.ops import less -from openvino.runtime.opset1.ops import less_equal -from openvino.runtime.opset1.ops import log -from openvino.runtime.opset1.ops import logical_and -from openvino.runtime.opset1.ops import logical_not -from openvino.runtime.opset1.ops import logical_or -from openvino.runtime.opset1.ops import logical_xor -from openvino.runtime.opset1.ops import lrn -from openvino.runtime.opset4.ops import lstm_cell -from openvino.runtime.opset1.ops import matmul -from openvino.runtime.opset1.ops import max_pool -from openvino.runtime.opset1.ops import maximum -from openvino.runtime.opset1.ops import minimum -from openvino.runtime.opset4.ops import mish -from openvino.runtime.opset1.ops import mod -from openvino.runtime.opset1.ops import multiply -from openvino.runtime.opset2.ops import mvn -from openvino.runtime.opset1.ops import negative -from openvino.runtime.opset4.ops import non_max_suppression -from openvino.runtime.opset3.ops import non_zero -from openvino.runtime.opset1.ops import normalize_l2 -from openvino.runtime.opset1.ops import not_equal -from openvino.runtime.opset1.ops import one_hot -from 
openvino.runtime.opset1.ops import pad -from openvino.runtime.opset1.ops import parameter -from openvino.runtime.opset1.ops import power -from openvino.runtime.opset1.ops import prelu -from openvino.runtime.opset1.ops import prior_box -from openvino.runtime.opset1.ops import prior_box_clustered -from openvino.runtime.opset1.ops import psroi_pooling -from openvino.runtime.opset4.ops import proposal -from openvino.runtime.opset1.ops import range -from openvino.runtime.opset3.ops import read_value -from openvino.runtime.opset4.ops import reduce_l1 -from openvino.runtime.opset4.ops import reduce_l2 -from openvino.runtime.opset1.ops import reduce_logical_and -from openvino.runtime.opset1.ops import reduce_logical_or -from openvino.runtime.opset1.ops import reduce_max -from openvino.runtime.opset1.ops import reduce_mean -from openvino.runtime.opset1.ops import reduce_min -from openvino.runtime.opset1.ops import reduce_prod -from openvino.runtime.opset1.ops import reduce_sum -from openvino.runtime.opset1.ops import region_yolo -from openvino.runtime.opset2.ops import reorg_yolo -from openvino.runtime.opset1.ops import relu -from openvino.runtime.opset1.ops import reshape -from openvino.runtime.opset1.ops import result -from openvino.runtime.opset1.ops import reverse_sequence -from openvino.runtime.opset3.ops import rnn_cell -from openvino.runtime.opset3.ops import roi_align -from openvino.runtime.opset2.ops import roi_pooling -from openvino.runtime.opset3.ops import scatter_elements_update -from openvino.runtime.opset4.ops import scatter_nd_update -from openvino.runtime.opset3.ops import scatter_update -from openvino.runtime.opset1.ops import select -from openvino.runtime.opset1.ops import selu -from openvino.runtime.opset3.ops import shape_of -from openvino.runtime.opset3.ops import shuffle_channels -from openvino.runtime.opset1.ops import sigmoid -from openvino.runtime.opset1.ops import sign -from openvino.runtime.opset1.ops import sin -from openvino.runtime.opset1.ops 
import sinh -from openvino.runtime.opset1.ops import softmax -from openvino.runtime.opset4.ops import softplus -from openvino.runtime.opset2.ops import space_to_batch -from openvino.runtime.opset1.ops import space_to_depth -from openvino.runtime.opset1.ops import split -from openvino.runtime.opset1.ops import sqrt -from openvino.runtime.opset1.ops import squared_difference -from openvino.runtime.opset1.ops import squeeze -from openvino.runtime.opset1.ops import strided_slice -from openvino.runtime.opset1.ops import subtract -from openvino.runtime.opset4.ops import swish -from openvino.runtime.opset1.ops import tan -from openvino.runtime.opset1.ops import tanh -from openvino.runtime.opset1.ops import tensor_iterator -from openvino.runtime.opset1.ops import tile -from openvino.runtime.opset3.ops import topk -from openvino.runtime.opset1.ops import transpose -from openvino.runtime.opset1.ops import unsqueeze -from openvino.runtime.opset1.ops import variadic_split +from openvino.opset1.ops import absolute +from openvino.opset1.ops import absolute as abs +from openvino.opset1.ops import acos +from openvino.opset4.ops import acosh +from openvino.opset1.ops import add +from openvino.opset1.ops import asin +from openvino.opset4.ops import asinh +from openvino.opset3.ops import assign +from openvino.opset1.ops import atan +from openvino.opset4.ops import atanh +from openvino.opset1.ops import avg_pool +from openvino.opset1.ops import batch_norm_inference +from openvino.opset2.ops import batch_to_space +from openvino.opset1.ops import binary_convolution +from openvino.opset3.ops import broadcast +from openvino.opset3.ops import bucketize +from openvino.opset1.ops import ceiling +from openvino.opset1.ops import ceiling as ceil +from openvino.opset1.ops import clamp +from openvino.opset1.ops import concat +from openvino.opset1.ops import constant +from openvino.opset1.ops import convert +from openvino.opset1.ops import convert_like +from openvino.opset1.ops import convolution 
+from openvino.opset1.ops import convolution_backprop_data +from openvino.opset1.ops import cos +from openvino.opset1.ops import cosh +from openvino.opset1.ops import ctc_greedy_decoder +from openvino.opset4.ops import ctc_loss +from openvino.opset3.ops import cum_sum +from openvino.opset3.ops import cum_sum as cumsum +from openvino.opset1.ops import deformable_convolution +from openvino.opset1.ops import deformable_psroi_pooling +from openvino.opset1.ops import depth_to_space +from openvino.opset1.ops import detection_output +from openvino.opset1.ops import divide +from openvino.opset1.ops import elu +from openvino.opset3.ops import embedding_bag_offsets_sum +from openvino.opset3.ops import embedding_bag_packed_sum +from openvino.opset3.ops import embedding_segments_sum +from openvino.opset3.ops import extract_image_patches +from openvino.opset1.ops import equal +from openvino.opset1.ops import erf +from openvino.opset1.ops import exp +from openvino.opset1.ops import fake_quantize +from openvino.opset1.ops import floor +from openvino.opset1.ops import floor_mod +from openvino.opset1.ops import gather +from openvino.opset1.ops import gather_tree +from openvino.opset2.ops import gelu +from openvino.opset1.ops import greater +from openvino.opset1.ops import greater_equal +from openvino.opset1.ops import grn +from openvino.opset1.ops import group_convolution +from openvino.opset1.ops import group_convolution_backprop_data +from openvino.opset3.ops import gru_cell +from openvino.opset1.ops import hard_sigmoid +from openvino.opset4.ops import hswish +from openvino.opset1.ops import interpolate +from openvino.opset1.ops import less +from openvino.opset1.ops import less_equal +from openvino.opset1.ops import log +from openvino.opset1.ops import logical_and +from openvino.opset1.ops import logical_not +from openvino.opset1.ops import logical_or +from openvino.opset1.ops import logical_xor +from openvino.opset1.ops import lrn +from openvino.opset4.ops import lstm_cell +from 
openvino.opset1.ops import matmul +from openvino.opset1.ops import max_pool +from openvino.opset1.ops import maximum +from openvino.opset1.ops import minimum +from openvino.opset4.ops import mish +from openvino.opset1.ops import mod +from openvino.opset1.ops import multiply +from openvino.opset2.ops import mvn +from openvino.opset1.ops import negative +from openvino.opset4.ops import non_max_suppression +from openvino.opset3.ops import non_zero +from openvino.opset1.ops import normalize_l2 +from openvino.opset1.ops import not_equal +from openvino.opset1.ops import one_hot +from openvino.opset1.ops import pad +from openvino.opset1.ops import parameter +from openvino.opset1.ops import power +from openvino.opset1.ops import prelu +from openvino.opset1.ops import prior_box +from openvino.opset1.ops import prior_box_clustered +from openvino.opset1.ops import psroi_pooling +from openvino.opset4.ops import proposal +from openvino.opset1.ops import range +from openvino.opset3.ops import read_value +from openvino.opset4.ops import reduce_l1 +from openvino.opset4.ops import reduce_l2 +from openvino.opset1.ops import reduce_logical_and +from openvino.opset1.ops import reduce_logical_or +from openvino.opset1.ops import reduce_max +from openvino.opset1.ops import reduce_mean +from openvino.opset1.ops import reduce_min +from openvino.opset1.ops import reduce_prod +from openvino.opset1.ops import reduce_sum +from openvino.opset1.ops import region_yolo +from openvino.opset2.ops import reorg_yolo +from openvino.opset1.ops import relu +from openvino.opset1.ops import reshape +from openvino.opset1.ops import result +from openvino.opset1.ops import reverse_sequence +from openvino.opset3.ops import rnn_cell +from openvino.opset3.ops import roi_align +from openvino.opset2.ops import roi_pooling +from openvino.opset3.ops import scatter_elements_update +from openvino.opset4.ops import scatter_nd_update +from openvino.opset3.ops import scatter_update +from openvino.opset1.ops import select 
+from openvino.opset1.ops import selu +from openvino.opset3.ops import shape_of +from openvino.opset3.ops import shuffle_channels +from openvino.opset1.ops import sigmoid +from openvino.opset1.ops import sign +from openvino.opset1.ops import sin +from openvino.opset1.ops import sinh +from openvino.opset1.ops import softmax +from openvino.opset4.ops import softplus +from openvino.opset2.ops import space_to_batch +from openvino.opset1.ops import space_to_depth +from openvino.opset1.ops import split +from openvino.opset1.ops import sqrt +from openvino.opset1.ops import squared_difference +from openvino.opset1.ops import squeeze +from openvino.opset1.ops import strided_slice +from openvino.opset1.ops import subtract +from openvino.opset4.ops import swish +from openvino.opset1.ops import tan +from openvino.opset1.ops import tanh +from openvino.opset1.ops import tensor_iterator +from openvino.opset1.ops import tile +from openvino.opset3.ops import topk +from openvino.opset1.ops import transpose +from openvino.opset1.ops import unsqueeze +from openvino.opset1.ops import variadic_split diff --git a/src/bindings/python/src/openvino/runtime/opset4/ops/__init__.py b/src/bindings/python/src/openvino/runtime/opset4/ops/__init__.py new file mode 100644 index 00000000000000..d70649a6b1db17 --- /dev/null +++ b/src/bindings/python/src/openvino/runtime/opset4/ops/__init__.py @@ -0,0 +1,19 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.opset4.ops import acosh +from openvino.opset4.ops import asinh +from openvino.opset4.ops import atanh +from openvino.opset4.ops import ctc_loss +from openvino.opset4.ops import hswish +from openvino.opset4.ops import lstm_cell +from openvino.opset4.ops import mish +from openvino.opset4.ops import non_max_suppression +from openvino.opset4.ops import proposal +from openvino.opset4.ops import range +from openvino.opset4.ops import reduce_l1 +from openvino.opset4.ops import 
reduce_l2 +from openvino.opset4.ops import scatter_nd_update +from openvino.opset4.ops import softplus +from openvino.opset4.ops import swish diff --git a/src/bindings/python/src/openvino/runtime/opset5/__init__.py b/src/bindings/python/src/openvino/runtime/opset5/__init__.py index 6d68b3e8d9f9cb..202b8137093f57 100644 --- a/src/bindings/python/src/openvino/runtime/opset5/__init__.py +++ b/src/bindings/python/src/openvino/runtime/opset5/__init__.py @@ -2,151 +2,151 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -from openvino.runtime.opset1.ops import absolute -from openvino.runtime.opset1.ops import absolute as abs -from openvino.runtime.opset1.ops import acos -from openvino.runtime.opset4.ops import acosh -from openvino.runtime.opset1.ops import add -from openvino.runtime.opset1.ops import asin -from openvino.runtime.opset4.ops import asinh -from openvino.runtime.opset3.ops import assign -from openvino.runtime.opset1.ops import atan -from openvino.runtime.opset4.ops import atanh -from openvino.runtime.opset1.ops import avg_pool -from openvino.runtime.opset5.ops import batch_norm_inference -from openvino.runtime.opset2.ops import batch_to_space -from openvino.runtime.opset1.ops import binary_convolution -from openvino.runtime.opset3.ops import broadcast -from openvino.runtime.opset3.ops import bucketize -from openvino.runtime.opset1.ops import ceiling -from openvino.runtime.opset1.ops import ceiling as ceil -from openvino.runtime.opset1.ops import clamp -from openvino.runtime.opset1.ops import concat -from openvino.runtime.opset1.ops import constant -from openvino.runtime.opset1.ops import convert -from openvino.runtime.opset1.ops import convert_like -from openvino.runtime.opset1.ops import convolution -from openvino.runtime.opset1.ops import convolution_backprop_data -from openvino.runtime.opset1.ops import cos -from openvino.runtime.opset1.ops import cosh -from openvino.runtime.opset1.ops import ctc_greedy_decoder -from 
openvino.runtime.opset4.ops import ctc_loss -from openvino.runtime.opset3.ops import cum_sum -from openvino.runtime.opset3.ops import cum_sum as cumsum -from openvino.runtime.opset1.ops import deformable_convolution -from openvino.runtime.opset1.ops import deformable_psroi_pooling -from openvino.runtime.opset1.ops import depth_to_space -from openvino.runtime.opset1.ops import detection_output -from openvino.runtime.opset1.ops import divide -from openvino.runtime.opset1.ops import elu -from openvino.runtime.opset3.ops import embedding_bag_offsets_sum -from openvino.runtime.opset3.ops import embedding_bag_packed_sum -from openvino.runtime.opset3.ops import embedding_segments_sum -from openvino.runtime.opset3.ops import extract_image_patches -from openvino.runtime.opset1.ops import equal -from openvino.runtime.opset1.ops import erf -from openvino.runtime.opset1.ops import exp -from openvino.runtime.opset1.ops import fake_quantize -from openvino.runtime.opset1.ops import floor -from openvino.runtime.opset1.ops import floor_mod -from openvino.runtime.opset1.ops import gather -from openvino.runtime.opset5.ops import gather_nd -from openvino.runtime.opset1.ops import gather_tree -from openvino.runtime.opset2.ops import gelu -from openvino.runtime.opset1.ops import greater -from openvino.runtime.opset1.ops import greater_equal -from openvino.runtime.opset1.ops import grn -from openvino.runtime.opset1.ops import group_convolution -from openvino.runtime.opset1.ops import group_convolution_backprop_data -from openvino.runtime.opset3.ops import gru_cell -from openvino.runtime.opset5.ops import gru_sequence -from openvino.runtime.opset1.ops import hard_sigmoid -from openvino.runtime.opset5.ops import hsigmoid -from openvino.runtime.opset4.ops import hswish -from openvino.runtime.opset1.ops import interpolate -from openvino.runtime.opset1.ops import less -from openvino.runtime.opset1.ops import less_equal -from openvino.runtime.opset1.ops import log -from 
openvino.runtime.opset1.ops import logical_and -from openvino.runtime.opset1.ops import logical_not -from openvino.runtime.opset1.ops import logical_or -from openvino.runtime.opset1.ops import logical_xor -from openvino.runtime.opset5.ops import log_softmax -from openvino.runtime.opset5.ops import loop -from openvino.runtime.opset1.ops import lrn -from openvino.runtime.opset4.ops import lstm_cell -from openvino.runtime.opset5.ops import lstm_sequence -from openvino.runtime.opset1.ops import matmul -from openvino.runtime.opset1.ops import max_pool -from openvino.runtime.opset1.ops import maximum -from openvino.runtime.opset1.ops import minimum -from openvino.runtime.opset4.ops import mish -from openvino.runtime.opset1.ops import mod -from openvino.runtime.opset1.ops import multiply -from openvino.runtime.opset2.ops import mvn -from openvino.runtime.opset1.ops import negative -from openvino.runtime.opset5.ops import non_max_suppression -from openvino.runtime.opset3.ops import non_zero -from openvino.runtime.opset1.ops import normalize_l2 -from openvino.runtime.opset1.ops import not_equal -from openvino.runtime.opset1.ops import one_hot -from openvino.runtime.opset1.ops import pad -from openvino.runtime.opset1.ops import parameter -from openvino.runtime.opset1.ops import power -from openvino.runtime.opset1.ops import prelu -from openvino.runtime.opset1.ops import prior_box -from openvino.runtime.opset1.ops import prior_box_clustered -from openvino.runtime.opset1.ops import psroi_pooling -from openvino.runtime.opset4.ops import proposal -from openvino.runtime.opset1.ops import range -from openvino.runtime.opset3.ops import read_value -from openvino.runtime.opset4.ops import reduce_l1 -from openvino.runtime.opset4.ops import reduce_l2 -from openvino.runtime.opset1.ops import reduce_logical_and -from openvino.runtime.opset1.ops import reduce_logical_or -from openvino.runtime.opset1.ops import reduce_max -from openvino.runtime.opset1.ops import reduce_mean -from 
openvino.runtime.opset1.ops import reduce_min -from openvino.runtime.opset1.ops import reduce_prod -from openvino.runtime.opset1.ops import reduce_sum -from openvino.runtime.opset1.ops import region_yolo -from openvino.runtime.opset2.ops import reorg_yolo -from openvino.runtime.opset1.ops import relu -from openvino.runtime.opset1.ops import reshape -from openvino.runtime.opset1.ops import result -from openvino.runtime.opset1.ops import reverse_sequence -from openvino.runtime.opset3.ops import rnn_cell -from openvino.runtime.opset5.ops import rnn_sequence -from openvino.runtime.opset3.ops import roi_align -from openvino.runtime.opset2.ops import roi_pooling -from openvino.runtime.opset5.ops import round -from openvino.runtime.opset3.ops import scatter_elements_update -from openvino.runtime.opset4.ops import scatter_nd_update -from openvino.runtime.opset3.ops import scatter_update -from openvino.runtime.opset1.ops import select -from openvino.runtime.opset1.ops import selu -from openvino.runtime.opset3.ops import shape_of -from openvino.runtime.opset3.ops import shuffle_channels -from openvino.runtime.opset1.ops import sigmoid -from openvino.runtime.opset1.ops import sign -from openvino.runtime.opset1.ops import sin -from openvino.runtime.opset1.ops import sinh -from openvino.runtime.opset1.ops import softmax -from openvino.runtime.opset4.ops import softplus -from openvino.runtime.opset2.ops import space_to_batch -from openvino.runtime.opset1.ops import space_to_depth -from openvino.runtime.opset1.ops import split -from openvino.runtime.opset1.ops import sqrt -from openvino.runtime.opset1.ops import squared_difference -from openvino.runtime.opset1.ops import squeeze -from openvino.runtime.opset1.ops import strided_slice -from openvino.runtime.opset1.ops import subtract -from openvino.runtime.opset4.ops import swish -from openvino.runtime.opset1.ops import tan -from openvino.runtime.opset1.ops import tanh -from openvino.runtime.opset1.ops import tensor_iterator -from 
openvino.runtime.opset1.ops import tile -from openvino.runtime.opset3.ops import topk -from openvino.runtime.opset1.ops import transpose -from openvino.runtime.opset1.ops import unsqueeze -from openvino.runtime.opset1.ops import variadic_split +from openvino.opset1.ops import absolute +from openvino.opset1.ops import absolute as abs +from openvino.opset1.ops import acos +from openvino.opset4.ops import acosh +from openvino.opset1.ops import add +from openvino.opset1.ops import asin +from openvino.opset4.ops import asinh +from openvino.opset3.ops import assign +from openvino.opset1.ops import atan +from openvino.opset4.ops import atanh +from openvino.opset1.ops import avg_pool +from openvino.opset5.ops import batch_norm_inference +from openvino.opset2.ops import batch_to_space +from openvino.opset1.ops import binary_convolution +from openvino.opset3.ops import broadcast +from openvino.opset3.ops import bucketize +from openvino.opset1.ops import ceiling +from openvino.opset1.ops import ceiling as ceil +from openvino.opset1.ops import clamp +from openvino.opset1.ops import concat +from openvino.opset1.ops import constant +from openvino.opset1.ops import convert +from openvino.opset1.ops import convert_like +from openvino.opset1.ops import convolution +from openvino.opset1.ops import convolution_backprop_data +from openvino.opset1.ops import cos +from openvino.opset1.ops import cosh +from openvino.opset1.ops import ctc_greedy_decoder +from openvino.opset4.ops import ctc_loss +from openvino.opset3.ops import cum_sum +from openvino.opset3.ops import cum_sum as cumsum +from openvino.opset1.ops import deformable_convolution +from openvino.opset1.ops import deformable_psroi_pooling +from openvino.opset1.ops import depth_to_space +from openvino.opset1.ops import detection_output +from openvino.opset1.ops import divide +from openvino.opset1.ops import elu +from openvino.opset3.ops import embedding_bag_offsets_sum +from openvino.opset3.ops import embedding_bag_packed_sum +from 
openvino.opset3.ops import embedding_segments_sum +from openvino.opset3.ops import extract_image_patches +from openvino.opset1.ops import equal +from openvino.opset1.ops import erf +from openvino.opset1.ops import exp +from openvino.opset1.ops import fake_quantize +from openvino.opset1.ops import floor +from openvino.opset1.ops import floor_mod +from openvino.opset1.ops import gather +from openvino.opset5.ops import gather_nd +from openvino.opset1.ops import gather_tree +from openvino.opset2.ops import gelu +from openvino.opset1.ops import greater +from openvino.opset1.ops import greater_equal +from openvino.opset1.ops import grn +from openvino.opset1.ops import group_convolution +from openvino.opset1.ops import group_convolution_backprop_data +from openvino.opset3.ops import gru_cell +from openvino.opset5.ops import gru_sequence +from openvino.opset1.ops import hard_sigmoid +from openvino.opset5.ops import hsigmoid +from openvino.opset4.ops import hswish +from openvino.opset1.ops import interpolate +from openvino.opset1.ops import less +from openvino.opset1.ops import less_equal +from openvino.opset1.ops import log +from openvino.opset1.ops import logical_and +from openvino.opset1.ops import logical_not +from openvino.opset1.ops import logical_or +from openvino.opset1.ops import logical_xor +from openvino.opset5.ops import log_softmax +from openvino.opset5.ops import loop +from openvino.opset1.ops import lrn +from openvino.opset4.ops import lstm_cell +from openvino.opset5.ops import lstm_sequence +from openvino.opset1.ops import matmul +from openvino.opset1.ops import max_pool +from openvino.opset1.ops import maximum +from openvino.opset1.ops import minimum +from openvino.opset4.ops import mish +from openvino.opset1.ops import mod +from openvino.opset1.ops import multiply +from openvino.opset2.ops import mvn +from openvino.opset1.ops import negative +from openvino.opset5.ops import non_max_suppression +from openvino.opset3.ops import non_zero +from 
openvino.opset1.ops import normalize_l2 +from openvino.opset1.ops import not_equal +from openvino.opset1.ops import one_hot +from openvino.opset1.ops import pad +from openvino.opset1.ops import parameter +from openvino.opset1.ops import power +from openvino.opset1.ops import prelu +from openvino.opset1.ops import prior_box +from openvino.opset1.ops import prior_box_clustered +from openvino.opset1.ops import psroi_pooling +from openvino.opset4.ops import proposal +from openvino.opset1.ops import range +from openvino.opset3.ops import read_value +from openvino.opset4.ops import reduce_l1 +from openvino.opset4.ops import reduce_l2 +from openvino.opset1.ops import reduce_logical_and +from openvino.opset1.ops import reduce_logical_or +from openvino.opset1.ops import reduce_max +from openvino.opset1.ops import reduce_mean +from openvino.opset1.ops import reduce_min +from openvino.opset1.ops import reduce_prod +from openvino.opset1.ops import reduce_sum +from openvino.opset1.ops import region_yolo +from openvino.opset2.ops import reorg_yolo +from openvino.opset1.ops import relu +from openvino.opset1.ops import reshape +from openvino.opset1.ops import result +from openvino.opset1.ops import reverse_sequence +from openvino.opset3.ops import rnn_cell +from openvino.opset5.ops import rnn_sequence +from openvino.opset3.ops import roi_align +from openvino.opset2.ops import roi_pooling +from openvino.opset5.ops import round +from openvino.opset3.ops import scatter_elements_update +from openvino.opset4.ops import scatter_nd_update +from openvino.opset3.ops import scatter_update +from openvino.opset1.ops import select +from openvino.opset1.ops import selu +from openvino.opset3.ops import shape_of +from openvino.opset3.ops import shuffle_channels +from openvino.opset1.ops import sigmoid +from openvino.opset1.ops import sign +from openvino.opset1.ops import sin +from openvino.opset1.ops import sinh +from openvino.opset1.ops import softmax +from openvino.opset4.ops import softplus 
+from openvino.opset2.ops import space_to_batch +from openvino.opset1.ops import space_to_depth +from openvino.opset1.ops import split +from openvino.opset1.ops import sqrt +from openvino.opset1.ops import squared_difference +from openvino.opset1.ops import squeeze +from openvino.opset1.ops import strided_slice +from openvino.opset1.ops import subtract +from openvino.opset4.ops import swish +from openvino.opset1.ops import tan +from openvino.opset1.ops import tanh +from openvino.opset1.ops import tensor_iterator +from openvino.opset1.ops import tile +from openvino.opset3.ops import topk +from openvino.opset1.ops import transpose +from openvino.opset1.ops import unsqueeze +from openvino.opset1.ops import variadic_split diff --git a/src/bindings/python/src/openvino/runtime/opset5/ops/__init__.py b/src/bindings/python/src/openvino/runtime/opset5/ops/__init__.py new file mode 100644 index 00000000000000..610622780c6f01 --- /dev/null +++ b/src/bindings/python/src/openvino/runtime/opset5/ops/__init__.py @@ -0,0 +1,14 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.opset5.ops import batch_norm_inference +from openvino.opset5.ops import gather_nd +from openvino.opset5.ops import gru_sequence +from openvino.opset5.ops import hsigmoid +from openvino.opset5.ops import log_softmax +from openvino.opset5.ops import loop +from openvino.opset5.ops import lstm_sequence +from openvino.opset5.ops import non_max_suppression +from openvino.opset5.ops import rnn_sequence +from openvino.opset5.ops import round diff --git a/src/bindings/python/src/openvino/runtime/opset6/__init__.py b/src/bindings/python/src/openvino/runtime/opset6/__init__.py index 2b2babb00c5021..315d80bc025a38 100644 --- a/src/bindings/python/src/openvino/runtime/opset6/__init__.py +++ b/src/bindings/python/src/openvino/runtime/opset6/__init__.py @@ -2,153 +2,153 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: 
Apache-2.0 -from openvino.runtime.opset1.ops import absolute -from openvino.runtime.opset1.ops import absolute as abs -from openvino.runtime.opset1.ops import acos -from openvino.runtime.opset4.ops import acosh -from openvino.runtime.opset1.ops import add -from openvino.runtime.opset1.ops import asin -from openvino.runtime.opset4.ops import asinh -from openvino.runtime.opset6.ops import assign -from openvino.runtime.opset1.ops import atan -from openvino.runtime.opset4.ops import atanh -from openvino.runtime.opset1.ops import avg_pool -from openvino.runtime.opset5.ops import batch_norm_inference -from openvino.runtime.opset2.ops import batch_to_space -from openvino.runtime.opset1.ops import binary_convolution -from openvino.runtime.opset3.ops import broadcast -from openvino.runtime.opset3.ops import bucketize -from openvino.runtime.opset1.ops import ceiling -from openvino.runtime.opset1.ops import ceiling as ceil -from openvino.runtime.opset1.ops import clamp -from openvino.runtime.opset1.ops import concat -from openvino.runtime.opset1.ops import constant -from openvino.runtime.opset1.ops import convert -from openvino.runtime.opset1.ops import convert_like -from openvino.runtime.opset1.ops import convolution -from openvino.runtime.opset1.ops import convolution_backprop_data -from openvino.runtime.opset1.ops import cos -from openvino.runtime.opset1.ops import cosh -from openvino.runtime.opset1.ops import ctc_greedy_decoder -from openvino.runtime.opset6.ops import ctc_greedy_decoder_seq_len -from openvino.runtime.opset4.ops import ctc_loss -from openvino.runtime.opset3.ops import cum_sum -from openvino.runtime.opset3.ops import cum_sum as cumsum -from openvino.runtime.opset1.ops import deformable_convolution -from openvino.runtime.opset1.ops import deformable_psroi_pooling -from openvino.runtime.opset1.ops import depth_to_space -from openvino.runtime.opset1.ops import detection_output -from openvino.runtime.opset1.ops import divide -from openvino.runtime.opset1.ops 
import elu -from openvino.runtime.opset3.ops import embedding_bag_offsets_sum -from openvino.runtime.opset3.ops import embedding_bag_packed_sum -from openvino.runtime.opset3.ops import embedding_segments_sum -from openvino.runtime.opset3.ops import extract_image_patches -from openvino.runtime.opset1.ops import equal -from openvino.runtime.opset1.ops import erf -from openvino.runtime.opset1.ops import exp -from openvino.runtime.opset1.ops import fake_quantize -from openvino.runtime.opset1.ops import floor -from openvino.runtime.opset1.ops import floor_mod -from openvino.runtime.opset1.ops import gather -from openvino.runtime.opset6.ops import gather_elements -from openvino.runtime.opset5.ops import gather_nd -from openvino.runtime.opset1.ops import gather_tree -from openvino.runtime.opset2.ops import gelu -from openvino.runtime.opset1.ops import greater -from openvino.runtime.opset1.ops import greater_equal -from openvino.runtime.opset1.ops import grn -from openvino.runtime.opset1.ops import group_convolution -from openvino.runtime.opset1.ops import group_convolution_backprop_data -from openvino.runtime.opset3.ops import gru_cell -from openvino.runtime.opset5.ops import gru_sequence -from openvino.runtime.opset1.ops import hard_sigmoid -from openvino.runtime.opset5.ops import hsigmoid -from openvino.runtime.opset4.ops import hswish -from openvino.runtime.opset1.ops import interpolate -from openvino.runtime.opset1.ops import less -from openvino.runtime.opset1.ops import less_equal -from openvino.runtime.opset1.ops import log -from openvino.runtime.opset1.ops import logical_and -from openvino.runtime.opset1.ops import logical_not -from openvino.runtime.opset1.ops import logical_or -from openvino.runtime.opset1.ops import logical_xor -from openvino.runtime.opset5.ops import log_softmax -from openvino.runtime.opset5.ops import loop -from openvino.runtime.opset1.ops import lrn -from openvino.runtime.opset4.ops import lstm_cell -from openvino.runtime.opset5.ops import 
lstm_sequence -from openvino.runtime.opset1.ops import matmul -from openvino.runtime.opset1.ops import max_pool -from openvino.runtime.opset1.ops import maximum -from openvino.runtime.opset1.ops import minimum -from openvino.runtime.opset4.ops import mish -from openvino.runtime.opset1.ops import mod -from openvino.runtime.opset1.ops import multiply -from openvino.runtime.opset6.ops import mvn -from openvino.runtime.opset1.ops import negative -from openvino.runtime.opset5.ops import non_max_suppression -from openvino.runtime.opset3.ops import non_zero -from openvino.runtime.opset1.ops import normalize_l2 -from openvino.runtime.opset1.ops import not_equal -from openvino.runtime.opset1.ops import one_hot -from openvino.runtime.opset1.ops import pad -from openvino.runtime.opset1.ops import parameter -from openvino.runtime.opset1.ops import power -from openvino.runtime.opset1.ops import prelu -from openvino.runtime.opset1.ops import prior_box -from openvino.runtime.opset1.ops import prior_box_clustered -from openvino.runtime.opset1.ops import psroi_pooling -from openvino.runtime.opset4.ops import proposal -from openvino.runtime.opset1.ops import range -from openvino.runtime.opset6.ops import read_value -from openvino.runtime.opset4.ops import reduce_l1 -from openvino.runtime.opset4.ops import reduce_l2 -from openvino.runtime.opset1.ops import reduce_logical_and -from openvino.runtime.opset1.ops import reduce_logical_or -from openvino.runtime.opset1.ops import reduce_max -from openvino.runtime.opset1.ops import reduce_mean -from openvino.runtime.opset1.ops import reduce_min -from openvino.runtime.opset1.ops import reduce_prod -from openvino.runtime.opset1.ops import reduce_sum -from openvino.runtime.opset1.ops import region_yolo -from openvino.runtime.opset2.ops import reorg_yolo -from openvino.runtime.opset1.ops import relu -from openvino.runtime.opset1.ops import reshape -from openvino.runtime.opset1.ops import result -from openvino.runtime.opset1.ops import 
reverse_sequence -from openvino.runtime.opset3.ops import rnn_cell -from openvino.runtime.opset5.ops import rnn_sequence -from openvino.runtime.opset3.ops import roi_align -from openvino.runtime.opset2.ops import roi_pooling -from openvino.runtime.opset5.ops import round -from openvino.runtime.opset3.ops import scatter_elements_update -from openvino.runtime.opset4.ops import scatter_nd_update -from openvino.runtime.opset3.ops import scatter_update -from openvino.runtime.opset1.ops import select -from openvino.runtime.opset1.ops import selu -from openvino.runtime.opset3.ops import shape_of -from openvino.runtime.opset3.ops import shuffle_channels -from openvino.runtime.opset1.ops import sigmoid -from openvino.runtime.opset1.ops import sign -from openvino.runtime.opset1.ops import sin -from openvino.runtime.opset1.ops import sinh -from openvino.runtime.opset1.ops import softmax -from openvino.runtime.opset4.ops import softplus -from openvino.runtime.opset2.ops import space_to_batch -from openvino.runtime.opset1.ops import space_to_depth -from openvino.runtime.opset1.ops import split -from openvino.runtime.opset1.ops import sqrt -from openvino.runtime.opset1.ops import squared_difference -from openvino.runtime.opset1.ops import squeeze -from openvino.runtime.opset1.ops import strided_slice -from openvino.runtime.opset1.ops import subtract -from openvino.runtime.opset4.ops import swish -from openvino.runtime.opset1.ops import tan -from openvino.runtime.opset1.ops import tanh -from openvino.runtime.opset1.ops import tensor_iterator -from openvino.runtime.opset1.ops import tile -from openvino.runtime.opset3.ops import topk -from openvino.runtime.opset1.ops import transpose -from openvino.runtime.opset1.ops import unsqueeze -from openvino.runtime.opset1.ops import variadic_split +from openvino.opset1.ops import absolute +from openvino.opset1.ops import absolute as abs +from openvino.opset1.ops import acos +from openvino.opset4.ops import acosh +from openvino.opset1.ops 
import add +from openvino.opset1.ops import asin +from openvino.opset4.ops import asinh +from openvino.opset6.ops import assign +from openvino.opset1.ops import atan +from openvino.opset4.ops import atanh +from openvino.opset1.ops import avg_pool +from openvino.opset5.ops import batch_norm_inference +from openvino.opset2.ops import batch_to_space +from openvino.opset1.ops import binary_convolution +from openvino.opset3.ops import broadcast +from openvino.opset3.ops import bucketize +from openvino.opset1.ops import ceiling +from openvino.opset1.ops import ceiling as ceil +from openvino.opset1.ops import clamp +from openvino.opset1.ops import concat +from openvino.opset1.ops import constant +from openvino.opset1.ops import convert +from openvino.opset1.ops import convert_like +from openvino.opset1.ops import convolution +from openvino.opset1.ops import convolution_backprop_data +from openvino.opset1.ops import cos +from openvino.opset1.ops import cosh +from openvino.opset1.ops import ctc_greedy_decoder +from openvino.opset6.ops import ctc_greedy_decoder_seq_len +from openvino.opset4.ops import ctc_loss +from openvino.opset3.ops import cum_sum +from openvino.opset3.ops import cum_sum as cumsum +from openvino.opset1.ops import deformable_convolution +from openvino.opset1.ops import deformable_psroi_pooling +from openvino.opset1.ops import depth_to_space +from openvino.opset1.ops import detection_output +from openvino.opset1.ops import divide +from openvino.opset1.ops import elu +from openvino.opset3.ops import embedding_bag_offsets_sum +from openvino.opset3.ops import embedding_bag_packed_sum +from openvino.opset3.ops import embedding_segments_sum +from openvino.opset3.ops import extract_image_patches +from openvino.opset1.ops import equal +from openvino.opset1.ops import erf +from openvino.opset1.ops import exp +from openvino.opset1.ops import fake_quantize +from openvino.opset1.ops import floor +from openvino.opset1.ops import floor_mod +from openvino.opset1.ops 
import gather +from openvino.opset6.ops import gather_elements +from openvino.opset5.ops import gather_nd +from openvino.opset1.ops import gather_tree +from openvino.opset2.ops import gelu +from openvino.opset1.ops import greater +from openvino.opset1.ops import greater_equal +from openvino.opset1.ops import grn +from openvino.opset1.ops import group_convolution +from openvino.opset1.ops import group_convolution_backprop_data +from openvino.opset3.ops import gru_cell +from openvino.opset5.ops import gru_sequence +from openvino.opset1.ops import hard_sigmoid +from openvino.opset5.ops import hsigmoid +from openvino.opset4.ops import hswish +from openvino.opset1.ops import interpolate +from openvino.opset1.ops import less +from openvino.opset1.ops import less_equal +from openvino.opset1.ops import log +from openvino.opset1.ops import logical_and +from openvino.opset1.ops import logical_not +from openvino.opset1.ops import logical_or +from openvino.opset1.ops import logical_xor +from openvino.opset5.ops import log_softmax +from openvino.opset5.ops import loop +from openvino.opset1.ops import lrn +from openvino.opset4.ops import lstm_cell +from openvino.opset5.ops import lstm_sequence +from openvino.opset1.ops import matmul +from openvino.opset1.ops import max_pool +from openvino.opset1.ops import maximum +from openvino.opset1.ops import minimum +from openvino.opset4.ops import mish +from openvino.opset1.ops import mod +from openvino.opset1.ops import multiply +from openvino.opset6.ops import mvn +from openvino.opset1.ops import negative +from openvino.opset5.ops import non_max_suppression +from openvino.opset3.ops import non_zero +from openvino.opset1.ops import normalize_l2 +from openvino.opset1.ops import not_equal +from openvino.opset1.ops import one_hot +from openvino.opset1.ops import pad +from openvino.opset1.ops import parameter +from openvino.opset1.ops import power +from openvino.opset1.ops import prelu +from openvino.opset1.ops import prior_box +from 
openvino.opset1.ops import prior_box_clustered +from openvino.opset1.ops import psroi_pooling +from openvino.opset4.ops import proposal +from openvino.opset1.ops import range +from openvino.opset6.ops import read_value +from openvino.opset4.ops import reduce_l1 +from openvino.opset4.ops import reduce_l2 +from openvino.opset1.ops import reduce_logical_and +from openvino.opset1.ops import reduce_logical_or +from openvino.opset1.ops import reduce_max +from openvino.opset1.ops import reduce_mean +from openvino.opset1.ops import reduce_min +from openvino.opset1.ops import reduce_prod +from openvino.opset1.ops import reduce_sum +from openvino.opset1.ops import region_yolo +from openvino.opset2.ops import reorg_yolo +from openvino.opset1.ops import relu +from openvino.opset1.ops import reshape +from openvino.opset1.ops import result +from openvino.opset1.ops import reverse_sequence +from openvino.opset3.ops import rnn_cell +from openvino.opset5.ops import rnn_sequence +from openvino.opset3.ops import roi_align +from openvino.opset2.ops import roi_pooling +from openvino.opset5.ops import round +from openvino.opset3.ops import scatter_elements_update +from openvino.opset4.ops import scatter_nd_update +from openvino.opset3.ops import scatter_update +from openvino.opset1.ops import select +from openvino.opset1.ops import selu +from openvino.opset3.ops import shape_of +from openvino.opset3.ops import shuffle_channels +from openvino.opset1.ops import sigmoid +from openvino.opset1.ops import sign +from openvino.opset1.ops import sin +from openvino.opset1.ops import sinh +from openvino.opset1.ops import softmax +from openvino.opset4.ops import softplus +from openvino.opset2.ops import space_to_batch +from openvino.opset1.ops import space_to_depth +from openvino.opset1.ops import split +from openvino.opset1.ops import sqrt +from openvino.opset1.ops import squared_difference +from openvino.opset1.ops import squeeze +from openvino.opset1.ops import strided_slice +from 
openvino.opset1.ops import subtract +from openvino.opset4.ops import swish +from openvino.opset1.ops import tan +from openvino.opset1.ops import tanh +from openvino.opset1.ops import tensor_iterator +from openvino.opset1.ops import tile +from openvino.opset3.ops import topk +from openvino.opset1.ops import transpose +from openvino.opset1.ops import unsqueeze +from openvino.opset1.ops import variadic_split diff --git a/src/bindings/python/src/openvino/runtime/opset6/ops/__init__.py b/src/bindings/python/src/openvino/runtime/opset6/ops/__init__.py new file mode 100644 index 00000000000000..fb23fe96e5e51f --- /dev/null +++ b/src/bindings/python/src/openvino/runtime/opset6/ops/__init__.py @@ -0,0 +1,9 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.opset6.ops import assign +from openvino.opset6.ops import ctc_greedy_decoder_seq_len +from openvino.opset6.ops import gather_elements +from openvino.opset6.ops import mvn +from openvino.opset6.ops import read_value diff --git a/src/bindings/python/src/openvino/runtime/opset7/__init__.py b/src/bindings/python/src/openvino/runtime/opset7/__init__.py index 9ee692cea75f59..16703072285487 100644 --- a/src/bindings/python/src/openvino/runtime/opset7/__init__.py +++ b/src/bindings/python/src/openvino/runtime/opset7/__init__.py @@ -2,157 +2,157 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -from openvino.runtime.opset1.ops import absolute -from openvino.runtime.opset1.ops import absolute as abs -from openvino.runtime.opset1.ops import acos -from openvino.runtime.opset4.ops import acosh -from openvino.runtime.opset1.ops import add -from openvino.runtime.opset1.ops import asin -from openvino.runtime.opset4.ops import asinh -from openvino.runtime.opset6.ops import assign -from openvino.runtime.opset1.ops import atan -from openvino.runtime.opset4.ops import atanh -from openvino.runtime.opset1.ops import avg_pool -from
openvino.runtime.opset5.ops import batch_norm_inference -from openvino.runtime.opset2.ops import batch_to_space -from openvino.runtime.opset1.ops import binary_convolution -from openvino.runtime.opset3.ops import broadcast -from openvino.runtime.opset3.ops import bucketize -from openvino.runtime.opset1.ops import ceiling -from openvino.runtime.opset1.ops import ceiling as ceil -from openvino.runtime.opset1.ops import clamp -from openvino.runtime.opset1.ops import concat -from openvino.runtime.opset1.ops import constant -from openvino.runtime.opset1.ops import convert -from openvino.runtime.opset1.ops import convert_like -from openvino.runtime.opset1.ops import convolution -from openvino.runtime.opset1.ops import convolution_backprop_data -from openvino.runtime.opset1.ops import cos -from openvino.runtime.opset1.ops import cosh -from openvino.runtime.opset1.ops import ctc_greedy_decoder -from openvino.runtime.opset6.ops import ctc_greedy_decoder_seq_len -from openvino.runtime.opset4.ops import ctc_loss -from openvino.runtime.opset3.ops import cum_sum -from openvino.runtime.opset3.ops import cum_sum as cumsum -from openvino.runtime.opset1.ops import deformable_convolution -from openvino.runtime.opset1.ops import deformable_psroi_pooling -from openvino.runtime.opset1.ops import depth_to_space -from openvino.runtime.opset1.ops import detection_output -from openvino.runtime.opset7.ops import dft -from openvino.runtime.opset1.ops import divide -from openvino.runtime.opset7.ops import einsum -from openvino.runtime.opset1.ops import elu -from openvino.runtime.opset3.ops import embedding_bag_offsets_sum -from openvino.runtime.opset3.ops import embedding_bag_packed_sum -from openvino.runtime.opset3.ops import embedding_segments_sum -from openvino.runtime.opset3.ops import extract_image_patches -from openvino.runtime.opset1.ops import equal -from openvino.runtime.opset1.ops import erf -from openvino.runtime.opset1.ops import exp -from openvino.runtime.opset1.ops import 
fake_quantize -from openvino.runtime.opset1.ops import floor -from openvino.runtime.opset1.ops import floor_mod -from openvino.runtime.opset7.ops import gather -from openvino.runtime.opset6.ops import gather_elements -from openvino.runtime.opset5.ops import gather_nd -from openvino.runtime.opset1.ops import gather_tree -from openvino.runtime.opset7.ops import gelu -from openvino.runtime.opset1.ops import greater -from openvino.runtime.opset1.ops import greater_equal -from openvino.runtime.opset1.ops import grn -from openvino.runtime.opset1.ops import group_convolution -from openvino.runtime.opset1.ops import group_convolution_backprop_data -from openvino.runtime.opset3.ops import gru_cell -from openvino.runtime.opset5.ops import gru_sequence -from openvino.runtime.opset1.ops import hard_sigmoid -from openvino.runtime.opset5.ops import hsigmoid -from openvino.runtime.opset4.ops import hswish -from openvino.runtime.opset7.ops import idft -from openvino.runtime.opset1.ops import interpolate -from openvino.runtime.opset1.ops import less -from openvino.runtime.opset1.ops import less_equal -from openvino.runtime.opset1.ops import log -from openvino.runtime.opset1.ops import logical_and -from openvino.runtime.opset1.ops import logical_not -from openvino.runtime.opset1.ops import logical_or -from openvino.runtime.opset1.ops import logical_xor -from openvino.runtime.opset5.ops import log_softmax -from openvino.runtime.opset5.ops import loop -from openvino.runtime.opset1.ops import lrn -from openvino.runtime.opset4.ops import lstm_cell -from openvino.runtime.opset5.ops import lstm_sequence -from openvino.runtime.opset1.ops import matmul -from openvino.runtime.opset1.ops import max_pool -from openvino.runtime.opset1.ops import maximum -from openvino.runtime.opset1.ops import minimum -from openvino.runtime.opset4.ops import mish -from openvino.runtime.opset1.ops import mod -from openvino.runtime.opset1.ops import multiply -from openvino.runtime.opset6.ops import mvn -from 
openvino.runtime.opset1.ops import negative -from openvino.runtime.opset5.ops import non_max_suppression -from openvino.runtime.opset3.ops import non_zero -from openvino.runtime.opset1.ops import normalize_l2 -from openvino.runtime.opset1.ops import not_equal -from openvino.runtime.opset1.ops import one_hot -from openvino.runtime.opset1.ops import pad -from openvino.runtime.opset1.ops import parameter -from openvino.runtime.opset1.ops import power -from openvino.runtime.opset1.ops import prelu -from openvino.runtime.opset1.ops import prior_box -from openvino.runtime.opset1.ops import prior_box_clustered -from openvino.runtime.opset1.ops import psroi_pooling -from openvino.runtime.opset4.ops import proposal -from openvino.runtime.opset1.ops import range -from openvino.runtime.opset6.ops import read_value -from openvino.runtime.opset4.ops import reduce_l1 -from openvino.runtime.opset4.ops import reduce_l2 -from openvino.runtime.opset1.ops import reduce_logical_and -from openvino.runtime.opset1.ops import reduce_logical_or -from openvino.runtime.opset1.ops import reduce_max -from openvino.runtime.opset1.ops import reduce_mean -from openvino.runtime.opset1.ops import reduce_min -from openvino.runtime.opset1.ops import reduce_prod -from openvino.runtime.opset1.ops import reduce_sum -from openvino.runtime.opset1.ops import region_yolo -from openvino.runtime.opset2.ops import reorg_yolo -from openvino.runtime.opset1.ops import relu -from openvino.runtime.opset1.ops import reshape -from openvino.runtime.opset1.ops import result -from openvino.runtime.opset1.ops import reverse_sequence -from openvino.runtime.opset3.ops import rnn_cell -from openvino.runtime.opset5.ops import rnn_sequence -from openvino.runtime.opset3.ops import roi_align -from openvino.runtime.opset2.ops import roi_pooling -from openvino.runtime.opset7.ops import roll -from openvino.runtime.opset5.ops import round -from openvino.runtime.opset3.ops import scatter_elements_update -from 
openvino.runtime.opset4.ops import scatter_nd_update -from openvino.runtime.opset3.ops import scatter_update -from openvino.runtime.opset1.ops import select -from openvino.runtime.opset1.ops import selu -from openvino.runtime.opset3.ops import shape_of -from openvino.runtime.opset3.ops import shuffle_channels -from openvino.runtime.opset1.ops import sigmoid -from openvino.runtime.opset1.ops import sign -from openvino.runtime.opset1.ops import sin -from openvino.runtime.opset1.ops import sinh -from openvino.runtime.opset1.ops import softmax -from openvino.runtime.opset4.ops import softplus -from openvino.runtime.opset2.ops import space_to_batch -from openvino.runtime.opset1.ops import space_to_depth -from openvino.runtime.opset1.ops import split -from openvino.runtime.opset1.ops import sqrt -from openvino.runtime.opset1.ops import squared_difference -from openvino.runtime.opset1.ops import squeeze -from openvino.runtime.opset1.ops import strided_slice -from openvino.runtime.opset1.ops import subtract -from openvino.runtime.opset4.ops import swish -from openvino.runtime.opset1.ops import tan -from openvino.runtime.opset1.ops import tanh -from openvino.runtime.opset1.ops import tensor_iterator -from openvino.runtime.opset1.ops import tile -from openvino.runtime.opset3.ops import topk -from openvino.runtime.opset1.ops import transpose -from openvino.runtime.opset1.ops import unsqueeze -from openvino.runtime.opset1.ops import variadic_split +from openvino.opset1.ops import absolute +from openvino.opset1.ops import absolute as abs +from openvino.opset1.ops import acos +from openvino.opset4.ops import acosh +from openvino.opset1.ops import add +from openvino.opset1.ops import asin +from openvino.opset4.ops import asinh +from openvino.opset6.ops import assign +from openvino.opset1.ops import atan +from openvino.opset4.ops import atanh +from openvino.opset1.ops import avg_pool +from openvino.opset5.ops import batch_norm_inference +from openvino.opset2.ops import 
batch_to_space +from openvino.opset1.ops import binary_convolution +from openvino.opset3.ops import broadcast +from openvino.opset3.ops import bucketize +from openvino.opset1.ops import ceiling +from openvino.opset1.ops import ceiling as ceil +from openvino.opset1.ops import clamp +from openvino.opset1.ops import concat +from openvino.opset1.ops import constant +from openvino.opset1.ops import convert +from openvino.opset1.ops import convert_like +from openvino.opset1.ops import convolution +from openvino.opset1.ops import convolution_backprop_data +from openvino.opset1.ops import cos +from openvino.opset1.ops import cosh +from openvino.opset1.ops import ctc_greedy_decoder +from openvino.opset6.ops import ctc_greedy_decoder_seq_len +from openvino.opset4.ops import ctc_loss +from openvino.opset3.ops import cum_sum +from openvino.opset3.ops import cum_sum as cumsum +from openvino.opset1.ops import deformable_convolution +from openvino.opset1.ops import deformable_psroi_pooling +from openvino.opset1.ops import depth_to_space +from openvino.opset1.ops import detection_output +from openvino.opset7.ops import dft +from openvino.opset1.ops import divide +from openvino.opset7.ops import einsum +from openvino.opset1.ops import elu +from openvino.opset3.ops import embedding_bag_offsets_sum +from openvino.opset3.ops import embedding_bag_packed_sum +from openvino.opset3.ops import embedding_segments_sum +from openvino.opset3.ops import extract_image_patches +from openvino.opset1.ops import equal +from openvino.opset1.ops import erf +from openvino.opset1.ops import exp +from openvino.opset1.ops import fake_quantize +from openvino.opset1.ops import floor +from openvino.opset1.ops import floor_mod +from openvino.opset7.ops import gather +from openvino.opset6.ops import gather_elements +from openvino.opset5.ops import gather_nd +from openvino.opset1.ops import gather_tree +from openvino.opset7.ops import gelu +from openvino.opset1.ops import greater +from openvino.opset1.ops 
import greater_equal +from openvino.opset1.ops import grn +from openvino.opset1.ops import group_convolution +from openvino.opset1.ops import group_convolution_backprop_data +from openvino.opset3.ops import gru_cell +from openvino.opset5.ops import gru_sequence +from openvino.opset1.ops import hard_sigmoid +from openvino.opset5.ops import hsigmoid +from openvino.opset4.ops import hswish +from openvino.opset7.ops import idft +from openvino.opset1.ops import interpolate +from openvino.opset1.ops import less +from openvino.opset1.ops import less_equal +from openvino.opset1.ops import log +from openvino.opset1.ops import logical_and +from openvino.opset1.ops import logical_not +from openvino.opset1.ops import logical_or +from openvino.opset1.ops import logical_xor +from openvino.opset5.ops import log_softmax +from openvino.opset5.ops import loop +from openvino.opset1.ops import lrn +from openvino.opset4.ops import lstm_cell +from openvino.opset5.ops import lstm_sequence +from openvino.opset1.ops import matmul +from openvino.opset1.ops import max_pool +from openvino.opset1.ops import maximum +from openvino.opset1.ops import minimum +from openvino.opset4.ops import mish +from openvino.opset1.ops import mod +from openvino.opset1.ops import multiply +from openvino.opset6.ops import mvn +from openvino.opset1.ops import negative +from openvino.opset5.ops import non_max_suppression +from openvino.opset3.ops import non_zero +from openvino.opset1.ops import normalize_l2 +from openvino.opset1.ops import not_equal +from openvino.opset1.ops import one_hot +from openvino.opset1.ops import pad +from openvino.opset1.ops import parameter +from openvino.opset1.ops import power +from openvino.opset1.ops import prelu +from openvino.opset1.ops import prior_box +from openvino.opset1.ops import prior_box_clustered +from openvino.opset1.ops import psroi_pooling +from openvino.opset4.ops import proposal +from openvino.opset1.ops import range +from openvino.opset6.ops import read_value +from 
openvino.opset4.ops import reduce_l1 +from openvino.opset4.ops import reduce_l2 +from openvino.opset1.ops import reduce_logical_and +from openvino.opset1.ops import reduce_logical_or +from openvino.opset1.ops import reduce_max +from openvino.opset1.ops import reduce_mean +from openvino.opset1.ops import reduce_min +from openvino.opset1.ops import reduce_prod +from openvino.opset1.ops import reduce_sum +from openvino.opset1.ops import region_yolo +from openvino.opset2.ops import reorg_yolo +from openvino.opset1.ops import relu +from openvino.opset1.ops import reshape +from openvino.opset1.ops import result +from openvino.opset1.ops import reverse_sequence +from openvino.opset3.ops import rnn_cell +from openvino.opset5.ops import rnn_sequence +from openvino.opset3.ops import roi_align +from openvino.opset2.ops import roi_pooling +from openvino.opset7.ops import roll +from openvino.opset5.ops import round +from openvino.opset3.ops import scatter_elements_update +from openvino.opset4.ops import scatter_nd_update +from openvino.opset3.ops import scatter_update +from openvino.opset1.ops import select +from openvino.opset1.ops import selu +from openvino.opset3.ops import shape_of +from openvino.opset3.ops import shuffle_channels +from openvino.opset1.ops import sigmoid +from openvino.opset1.ops import sign +from openvino.opset1.ops import sin +from openvino.opset1.ops import sinh +from openvino.opset1.ops import softmax +from openvino.opset4.ops import softplus +from openvino.opset2.ops import space_to_batch +from openvino.opset1.ops import space_to_depth +from openvino.opset1.ops import split +from openvino.opset1.ops import sqrt +from openvino.opset1.ops import squared_difference +from openvino.opset1.ops import squeeze +from openvino.opset1.ops import strided_slice +from openvino.opset1.ops import subtract +from openvino.opset4.ops import swish +from openvino.opset1.ops import tan +from openvino.opset1.ops import tanh +from openvino.opset1.ops import tensor_iterator 
+from openvino.opset1.ops import tile +from openvino.opset3.ops import topk +from openvino.opset1.ops import transpose +from openvino.opset1.ops import unsqueeze +from openvino.opset1.ops import variadic_split diff --git a/src/bindings/python/src/openvino/runtime/opset7/ops/__init__.py b/src/bindings/python/src/openvino/runtime/opset7/ops/__init__.py new file mode 100644 index 00000000000000..9517e795d355d3 --- /dev/null +++ b/src/bindings/python/src/openvino/runtime/opset7/ops/__init__.py @@ -0,0 +1,10 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.opset7.ops import dft +from openvino.opset7.ops import einsum +from openvino.opset7.ops import gather +from openvino.opset7.ops import gelu +from openvino.opset7.ops import idft +from openvino.opset7.ops import roll diff --git a/src/bindings/python/src/openvino/runtime/opset8/__init__.py b/src/bindings/python/src/openvino/runtime/opset8/__init__.py index bf6be68ca0cbc6..e45c37863b193b 100644 --- a/src/bindings/python/src/openvino/runtime/opset8/__init__.py +++ b/src/bindings/python/src/openvino/runtime/opset8/__init__.py @@ -2,168 +2,168 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -from openvino.runtime.opset1.ops import absolute -from openvino.runtime.opset1.ops import absolute as abs -from openvino.runtime.opset1.ops import acos -from openvino.runtime.opset4.ops import acosh -from openvino.runtime.opset8.ops import adaptive_avg_pool -from openvino.runtime.opset8.ops import adaptive_max_pool -from openvino.runtime.opset1.ops import add -from openvino.runtime.opset1.ops import asin -from openvino.runtime.opset4.ops import asinh -from openvino.runtime.opset6.ops import assign -from openvino.runtime.opset1.ops import atan -from openvino.runtime.opset4.ops import atanh -from openvino.runtime.opset1.ops import avg_pool -from openvino.runtime.opset5.ops import batch_norm_inference -from 
openvino.runtime.opset2.ops import batch_to_space -from openvino.runtime.opset1.ops import binary_convolution -from openvino.runtime.opset3.ops import broadcast -from openvino.runtime.opset3.ops import bucketize -from openvino.runtime.opset1.ops import ceiling -from openvino.runtime.opset1.ops import ceiling as ceil -from openvino.runtime.opset1.ops import clamp -from openvino.runtime.opset1.ops import concat -from openvino.runtime.opset1.ops import constant -from openvino.runtime.opset1.ops import convert -from openvino.runtime.opset1.ops import convert_like -from openvino.runtime.opset1.ops import convolution -from openvino.runtime.opset1.ops import convolution_backprop_data -from openvino.runtime.opset1.ops import cos -from openvino.runtime.opset1.ops import cosh -from openvino.runtime.opset1.ops import ctc_greedy_decoder -from openvino.runtime.opset6.ops import ctc_greedy_decoder_seq_len -from openvino.runtime.opset4.ops import ctc_loss -from openvino.runtime.opset3.ops import cum_sum -from openvino.runtime.opset3.ops import cum_sum as cumsum -from openvino.runtime.opset8.ops import deformable_convolution -from openvino.runtime.opset1.ops import deformable_psroi_pooling -from openvino.runtime.opset1.ops import depth_to_space -from openvino.runtime.opset8.ops import detection_output -from openvino.runtime.opset7.ops import dft -from openvino.runtime.opset1.ops import divide -from openvino.runtime.opset7.ops import einsum -from openvino.runtime.opset1.ops import elu -from openvino.runtime.opset3.ops import embedding_bag_offsets_sum -from openvino.runtime.opset3.ops import embedding_bag_packed_sum -from openvino.runtime.opset3.ops import embedding_segments_sum -from openvino.runtime.opset3.ops import extract_image_patches -from openvino.runtime.opset1.ops import equal -from openvino.runtime.opset1.ops import erf -from openvino.runtime.opset1.ops import exp -from openvino.runtime.opset1.ops import fake_quantize -from openvino.runtime.opset1.ops import floor -from 
openvino.runtime.opset1.ops import floor_mod -from openvino.runtime.opset8.ops import gather -from openvino.runtime.opset6.ops import gather_elements -from openvino.runtime.opset8.ops import gather_nd -from openvino.runtime.opset1.ops import gather_tree -from openvino.runtime.opset7.ops import gelu -from openvino.runtime.opset1.ops import greater -from openvino.runtime.opset1.ops import greater_equal -from openvino.runtime.opset1.ops import grn -from openvino.runtime.opset1.ops import group_convolution -from openvino.runtime.opset1.ops import group_convolution_backprop_data -from openvino.runtime.opset3.ops import gru_cell -from openvino.runtime.opset5.ops import gru_sequence -from openvino.runtime.opset1.ops import hard_sigmoid -from openvino.runtime.opset5.ops import hsigmoid -from openvino.runtime.opset4.ops import hswish -from openvino.runtime.opset7.ops import idft -from openvino.runtime.opset8.ops import if_op -from openvino.runtime.opset1.ops import interpolate -from openvino.runtime.opset8.ops import i420_to_bgr -from openvino.runtime.opset8.ops import i420_to_rgb -from openvino.runtime.opset1.ops import less -from openvino.runtime.opset1.ops import less_equal -from openvino.runtime.opset1.ops import log -from openvino.runtime.opset1.ops import logical_and -from openvino.runtime.opset1.ops import logical_not -from openvino.runtime.opset1.ops import logical_or -from openvino.runtime.opset1.ops import logical_xor -from openvino.runtime.opset5.ops import log_softmax -from openvino.runtime.opset5.ops import loop -from openvino.runtime.opset1.ops import lrn -from openvino.runtime.opset4.ops import lstm_cell -from openvino.runtime.opset5.ops import lstm_sequence -from openvino.runtime.opset1.ops import matmul -from openvino.runtime.opset8.ops import matrix_nms -from openvino.runtime.opset8.ops import max_pool -from openvino.runtime.opset1.ops import maximum -from openvino.runtime.opset1.ops import minimum -from openvino.runtime.opset4.ops import mish -from 
openvino.runtime.opset1.ops import mod -from openvino.runtime.opset8.ops import multiclass_nms -from openvino.runtime.opset1.ops import multiply -from openvino.runtime.opset6.ops import mvn -from openvino.runtime.opset1.ops import negative -from openvino.runtime.opset5.ops import non_max_suppression -from openvino.runtime.opset3.ops import non_zero -from openvino.runtime.opset1.ops import normalize_l2 -from openvino.runtime.opset1.ops import not_equal -from openvino.runtime.opset8.ops import nv12_to_bgr -from openvino.runtime.opset8.ops import nv12_to_rgb -from openvino.runtime.opset1.ops import one_hot -from openvino.runtime.opset1.ops import pad -from openvino.runtime.opset1.ops import parameter -from openvino.runtime.opset1.ops import power -from openvino.runtime.opset1.ops import prelu -from openvino.runtime.opset8.ops import prior_box -from openvino.runtime.opset1.ops import prior_box_clustered -from openvino.runtime.opset1.ops import psroi_pooling -from openvino.runtime.opset4.ops import proposal -from openvino.runtime.opset1.ops import range -from openvino.runtime.opset8.ops import random_uniform -from openvino.runtime.opset6.ops import read_value -from openvino.runtime.opset4.ops import reduce_l1 -from openvino.runtime.opset4.ops import reduce_l2 -from openvino.runtime.opset1.ops import reduce_logical_and -from openvino.runtime.opset1.ops import reduce_logical_or -from openvino.runtime.opset1.ops import reduce_max -from openvino.runtime.opset1.ops import reduce_mean -from openvino.runtime.opset1.ops import reduce_min -from openvino.runtime.opset1.ops import reduce_prod -from openvino.runtime.opset1.ops import reduce_sum -from openvino.runtime.opset1.ops import region_yolo -from openvino.runtime.opset2.ops import reorg_yolo -from openvino.runtime.opset1.ops import relu -from openvino.runtime.opset1.ops import reshape -from openvino.runtime.opset1.ops import result -from openvino.runtime.opset1.ops import reverse_sequence -from openvino.runtime.opset3.ops 
import rnn_cell -from openvino.runtime.opset5.ops import rnn_sequence -from openvino.runtime.opset3.ops import roi_align -from openvino.runtime.opset2.ops import roi_pooling -from openvino.runtime.opset7.ops import roll -from openvino.runtime.opset5.ops import round -from openvino.runtime.opset3.ops import scatter_elements_update -from openvino.runtime.opset4.ops import scatter_nd_update -from openvino.runtime.opset3.ops import scatter_update -from openvino.runtime.opset1.ops import select -from openvino.runtime.opset1.ops import selu -from openvino.runtime.opset3.ops import shape_of -from openvino.runtime.opset3.ops import shuffle_channels -from openvino.runtime.opset1.ops import sigmoid -from openvino.runtime.opset1.ops import sign -from openvino.runtime.opset1.ops import sin -from openvino.runtime.opset1.ops import sinh -from openvino.runtime.opset8.ops import slice -from openvino.runtime.opset8.ops import softmax -from openvino.runtime.opset4.ops import softplus -from openvino.runtime.opset2.ops import space_to_batch -from openvino.runtime.opset1.ops import space_to_depth -from openvino.runtime.opset1.ops import split -from openvino.runtime.opset1.ops import sqrt -from openvino.runtime.opset1.ops import squared_difference -from openvino.runtime.opset1.ops import squeeze -from openvino.runtime.opset1.ops import strided_slice -from openvino.runtime.opset1.ops import subtract -from openvino.runtime.opset4.ops import swish -from openvino.runtime.opset1.ops import tan -from openvino.runtime.opset1.ops import tanh -from openvino.runtime.opset1.ops import tensor_iterator -from openvino.runtime.opset1.ops import tile -from openvino.runtime.opset3.ops import topk -from openvino.runtime.opset1.ops import transpose -from openvino.runtime.opset1.ops import unsqueeze -from openvino.runtime.opset1.ops import variadic_split +from openvino.opset1.ops import absolute +from openvino.opset1.ops import absolute as abs +from openvino.opset1.ops import acos +from openvino.opset4.ops 
import acosh +from openvino.opset8.ops import adaptive_avg_pool +from openvino.opset8.ops import adaptive_max_pool +from openvino.opset1.ops import add +from openvino.opset1.ops import asin +from openvino.opset4.ops import asinh +from openvino.opset6.ops import assign +from openvino.opset1.ops import atan +from openvino.opset4.ops import atanh +from openvino.opset1.ops import avg_pool +from openvino.opset5.ops import batch_norm_inference +from openvino.opset2.ops import batch_to_space +from openvino.opset1.ops import binary_convolution +from openvino.opset3.ops import broadcast +from openvino.opset3.ops import bucketize +from openvino.opset1.ops import ceiling +from openvino.opset1.ops import ceiling as ceil +from openvino.opset1.ops import clamp +from openvino.opset1.ops import concat +from openvino.opset1.ops import constant +from openvino.opset1.ops import convert +from openvino.opset1.ops import convert_like +from openvino.opset1.ops import convolution +from openvino.opset1.ops import convolution_backprop_data +from openvino.opset1.ops import cos +from openvino.opset1.ops import cosh +from openvino.opset1.ops import ctc_greedy_decoder +from openvino.opset6.ops import ctc_greedy_decoder_seq_len +from openvino.opset4.ops import ctc_loss +from openvino.opset3.ops import cum_sum +from openvino.opset3.ops import cum_sum as cumsum +from openvino.opset8.ops import deformable_convolution +from openvino.opset1.ops import deformable_psroi_pooling +from openvino.opset1.ops import depth_to_space +from openvino.opset8.ops import detection_output +from openvino.opset7.ops import dft +from openvino.opset1.ops import divide +from openvino.opset7.ops import einsum +from openvino.opset1.ops import elu +from openvino.opset3.ops import embedding_bag_offsets_sum +from openvino.opset3.ops import embedding_bag_packed_sum +from openvino.opset3.ops import embedding_segments_sum +from openvino.opset3.ops import extract_image_patches +from openvino.opset1.ops import equal +from 
openvino.opset1.ops import erf +from openvino.opset1.ops import exp +from openvino.opset1.ops import fake_quantize +from openvino.opset1.ops import floor +from openvino.opset1.ops import floor_mod +from openvino.opset8.ops import gather +from openvino.opset6.ops import gather_elements +from openvino.opset8.ops import gather_nd +from openvino.opset1.ops import gather_tree +from openvino.opset7.ops import gelu +from openvino.opset1.ops import greater +from openvino.opset1.ops import greater_equal +from openvino.opset1.ops import grn +from openvino.opset1.ops import group_convolution +from openvino.opset1.ops import group_convolution_backprop_data +from openvino.opset3.ops import gru_cell +from openvino.opset5.ops import gru_sequence +from openvino.opset1.ops import hard_sigmoid +from openvino.opset5.ops import hsigmoid +from openvino.opset4.ops import hswish +from openvino.opset7.ops import idft +from openvino.opset8.ops import if_op +from openvino.opset1.ops import interpolate +from openvino.opset8.ops import i420_to_bgr +from openvino.opset8.ops import i420_to_rgb +from openvino.opset1.ops import less +from openvino.opset1.ops import less_equal +from openvino.opset1.ops import log +from openvino.opset1.ops import logical_and +from openvino.opset1.ops import logical_not +from openvino.opset1.ops import logical_or +from openvino.opset1.ops import logical_xor +from openvino.opset5.ops import log_softmax +from openvino.opset5.ops import loop +from openvino.opset1.ops import lrn +from openvino.opset4.ops import lstm_cell +from openvino.opset5.ops import lstm_sequence +from openvino.opset1.ops import matmul +from openvino.opset8.ops import matrix_nms +from openvino.opset8.ops import max_pool +from openvino.opset1.ops import maximum +from openvino.opset1.ops import minimum +from openvino.opset4.ops import mish +from openvino.opset1.ops import mod +from openvino.opset8.ops import multiclass_nms +from openvino.opset1.ops import multiply +from openvino.opset6.ops import mvn 
+from openvino.opset1.ops import negative +from openvino.opset5.ops import non_max_suppression +from openvino.opset3.ops import non_zero +from openvino.opset1.ops import normalize_l2 +from openvino.opset1.ops import not_equal +from openvino.opset8.ops import nv12_to_bgr +from openvino.opset8.ops import nv12_to_rgb +from openvino.opset1.ops import one_hot +from openvino.opset1.ops import pad +from openvino.opset1.ops import parameter +from openvino.opset1.ops import power +from openvino.opset1.ops import prelu +from openvino.opset8.ops import prior_box +from openvino.opset1.ops import prior_box_clustered +from openvino.opset1.ops import psroi_pooling +from openvino.opset4.ops import proposal +from openvino.opset1.ops import range +from openvino.opset8.ops import random_uniform +from openvino.opset6.ops import read_value +from openvino.opset4.ops import reduce_l1 +from openvino.opset4.ops import reduce_l2 +from openvino.opset1.ops import reduce_logical_and +from openvino.opset1.ops import reduce_logical_or +from openvino.opset1.ops import reduce_max +from openvino.opset1.ops import reduce_mean +from openvino.opset1.ops import reduce_min +from openvino.opset1.ops import reduce_prod +from openvino.opset1.ops import reduce_sum +from openvino.opset1.ops import region_yolo +from openvino.opset2.ops import reorg_yolo +from openvino.opset1.ops import relu +from openvino.opset1.ops import reshape +from openvino.opset1.ops import result +from openvino.opset1.ops import reverse_sequence +from openvino.opset3.ops import rnn_cell +from openvino.opset5.ops import rnn_sequence +from openvino.opset3.ops import roi_align +from openvino.opset2.ops import roi_pooling +from openvino.opset7.ops import roll +from openvino.opset5.ops import round +from openvino.opset3.ops import scatter_elements_update +from openvino.opset4.ops import scatter_nd_update +from openvino.opset3.ops import scatter_update +from openvino.opset1.ops import select +from openvino.opset1.ops import selu +from 
openvino.opset3.ops import shape_of +from openvino.opset3.ops import shuffle_channels +from openvino.opset1.ops import sigmoid +from openvino.opset1.ops import sign +from openvino.opset1.ops import sin +from openvino.opset1.ops import sinh +from openvino.opset8.ops import slice +from openvino.opset8.ops import softmax +from openvino.opset4.ops import softplus +from openvino.opset2.ops import space_to_batch +from openvino.opset1.ops import space_to_depth +from openvino.opset1.ops import split +from openvino.opset1.ops import sqrt +from openvino.opset1.ops import squared_difference +from openvino.opset1.ops import squeeze +from openvino.opset1.ops import strided_slice +from openvino.opset1.ops import subtract +from openvino.opset4.ops import swish +from openvino.opset1.ops import tan +from openvino.opset1.ops import tanh +from openvino.opset1.ops import tensor_iterator +from openvino.opset1.ops import tile +from openvino.opset3.ops import topk +from openvino.opset1.ops import transpose +from openvino.opset1.ops import unsqueeze +from openvino.opset1.ops import variadic_split diff --git a/src/bindings/python/src/openvino/runtime/opset8/ops/__init__.py b/src/bindings/python/src/openvino/runtime/opset8/ops/__init__.py new file mode 100644 index 00000000000000..053708521a6dae --- /dev/null +++ b/src/bindings/python/src/openvino/runtime/opset8/ops/__init__.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.opset8.ops import adaptive_avg_pool +from openvino.opset8.ops import adaptive_max_pool +from openvino.opset8.ops import deformable_convolution +from openvino.opset8.ops import detection_output +from openvino.opset8.ops import gather +from openvino.opset8.ops import gather_nd +from openvino.opset8.ops import if_op +from openvino.opset8.ops import i420_to_bgr +from openvino.opset8.ops import i420_to_rgb +from openvino.opset8.ops import matrix_nms +from openvino.opset8.ops import 
max_pool +from openvino.opset8.ops import multiclass_nms +from openvino.opset8.ops import nv12_to_bgr +from openvino.opset8.ops import nv12_to_rgb +from openvino.opset8.ops import prior_box +from openvino.opset8.ops import random_uniform +from openvino.opset8.ops import slice +from openvino.opset8.ops import softmax diff --git a/src/bindings/python/src/openvino/runtime/opset9/__init__.py b/src/bindings/python/src/openvino/runtime/opset9/__init__.py index 138ecdbdcd5d33..03051d46b58759 100644 --- a/src/bindings/python/src/openvino/runtime/opset9/__init__.py +++ b/src/bindings/python/src/openvino/runtime/opset9/__init__.py @@ -2,174 +2,174 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -from openvino.runtime.opset1.ops import absolute -from openvino.runtime.opset1.ops import absolute as abs -from openvino.runtime.opset1.ops import acos -from openvino.runtime.opset4.ops import acosh -from openvino.runtime.opset8.ops import adaptive_avg_pool -from openvino.runtime.opset8.ops import adaptive_max_pool -from openvino.runtime.opset1.ops import add -from openvino.runtime.opset1.ops import asin -from openvino.runtime.opset4.ops import asinh -from openvino.runtime.opset6.ops import assign -from openvino.runtime.opset1.ops import atan -from openvino.runtime.opset4.ops import atanh -from openvino.runtime.opset1.ops import avg_pool -from openvino.runtime.opset5.ops import batch_norm_inference -from openvino.runtime.opset2.ops import batch_to_space -from openvino.runtime.opset1.ops import binary_convolution -from openvino.runtime.opset3.ops import broadcast -from openvino.runtime.opset3.ops import bucketize -from openvino.runtime.opset1.ops import ceiling -from openvino.runtime.opset1.ops import ceiling as ceil -from openvino.runtime.opset1.ops import clamp -from openvino.runtime.opset1.ops import concat -from openvino.runtime.opset1.ops import constant -from openvino.runtime.opset1.ops import convert -from openvino.runtime.opset1.ops import 
convert_like -from openvino.runtime.opset1.ops import convolution -from openvino.runtime.opset1.ops import convolution_backprop_data -from openvino.runtime.opset1.ops import cos -from openvino.runtime.opset1.ops import cosh -from openvino.runtime.opset1.ops import ctc_greedy_decoder -from openvino.runtime.opset6.ops import ctc_greedy_decoder_seq_len -from openvino.runtime.opset4.ops import ctc_loss -from openvino.runtime.opset3.ops import cum_sum -from openvino.runtime.opset3.ops import cum_sum as cumsum -from openvino.runtime.opset8.ops import deformable_convolution -from openvino.runtime.opset1.ops import deformable_psroi_pooling -from openvino.runtime.opset1.ops import depth_to_space -from openvino.runtime.opset8.ops import detection_output -from openvino.runtime.opset7.ops import dft -from openvino.runtime.opset1.ops import divide -from openvino.runtime.opset7.ops import einsum -from openvino.runtime.opset1.ops import elu -from openvino.runtime.opset3.ops import embedding_bag_offsets_sum -from openvino.runtime.opset3.ops import embedding_bag_packed_sum -from openvino.runtime.opset3.ops import embedding_segments_sum -from openvino.runtime.opset3.ops import extract_image_patches -from openvino.runtime.opset1.ops import equal -from openvino.runtime.opset1.ops import erf -from openvino.runtime.opset1.ops import exp -from openvino.runtime.opset9.ops import eye -from openvino.runtime.opset1.ops import fake_quantize -from openvino.runtime.opset1.ops import floor -from openvino.runtime.opset1.ops import floor_mod -from openvino.runtime.opset8.ops import gather -from openvino.runtime.opset6.ops import gather_elements -from openvino.runtime.opset8.ops import gather_nd -from openvino.runtime.opset1.ops import gather_tree -from openvino.runtime.opset7.ops import gelu -from openvino.runtime.opset9.ops import generate_proposals -from openvino.runtime.opset1.ops import greater -from openvino.runtime.opset1.ops import greater_equal -from openvino.runtime.opset9.ops import 
grid_sample -from openvino.runtime.opset1.ops import grn -from openvino.runtime.opset1.ops import group_convolution -from openvino.runtime.opset1.ops import group_convolution_backprop_data -from openvino.runtime.opset3.ops import gru_cell -from openvino.runtime.opset5.ops import gru_sequence -from openvino.runtime.opset1.ops import hard_sigmoid -from openvino.runtime.opset5.ops import hsigmoid -from openvino.runtime.opset4.ops import hswish -from openvino.runtime.opset7.ops import idft -from openvino.runtime.opset8.ops import if_op -from openvino.runtime.opset1.ops import interpolate -from openvino.runtime.opset9.ops import irdft -from openvino.runtime.opset8.ops import i420_to_bgr -from openvino.runtime.opset8.ops import i420_to_rgb -from openvino.runtime.opset1.ops import less -from openvino.runtime.opset1.ops import less_equal -from openvino.runtime.opset1.ops import log -from openvino.runtime.opset1.ops import logical_and -from openvino.runtime.opset1.ops import logical_not -from openvino.runtime.opset1.ops import logical_or -from openvino.runtime.opset1.ops import logical_xor -from openvino.runtime.opset5.ops import log_softmax -from openvino.runtime.opset5.ops import loop -from openvino.runtime.opset1.ops import lrn -from openvino.runtime.opset4.ops import lstm_cell -from openvino.runtime.opset5.ops import lstm_sequence -from openvino.runtime.opset1.ops import matmul -from openvino.runtime.opset8.ops import matrix_nms -from openvino.runtime.opset8.ops import max_pool -from openvino.runtime.opset1.ops import maximum -from openvino.runtime.opset1.ops import minimum -from openvino.runtime.opset4.ops import mish -from openvino.runtime.opset1.ops import mod -from openvino.runtime.opset9.ops import multiclass_nms -from openvino.runtime.opset1.ops import multiply -from openvino.runtime.opset6.ops import mvn -from openvino.runtime.opset1.ops import negative -from openvino.runtime.opset9.ops import non_max_suppression -from openvino.runtime.opset3.ops import non_zero 
-from openvino.runtime.opset1.ops import normalize_l2 -from openvino.runtime.opset1.ops import not_equal -from openvino.runtime.opset8.ops import nv12_to_bgr -from openvino.runtime.opset8.ops import nv12_to_rgb -from openvino.runtime.opset1.ops import one_hot -from openvino.runtime.opset1.ops import pad -from openvino.runtime.opset1.ops import parameter -from openvino.runtime.opset1.ops import power -from openvino.runtime.opset1.ops import prelu -from openvino.runtime.opset8.ops import prior_box -from openvino.runtime.opset1.ops import prior_box_clustered -from openvino.runtime.opset1.ops import psroi_pooling -from openvino.runtime.opset4.ops import proposal -from openvino.runtime.opset1.ops import range -from openvino.runtime.opset8.ops import random_uniform -from openvino.runtime.opset9.ops import rdft -from openvino.runtime.opset6.ops import read_value -from openvino.runtime.opset4.ops import reduce_l1 -from openvino.runtime.opset4.ops import reduce_l2 -from openvino.runtime.opset1.ops import reduce_logical_and -from openvino.runtime.opset1.ops import reduce_logical_or -from openvino.runtime.opset1.ops import reduce_max -from openvino.runtime.opset1.ops import reduce_mean -from openvino.runtime.opset1.ops import reduce_min -from openvino.runtime.opset1.ops import reduce_prod -from openvino.runtime.opset1.ops import reduce_sum -from openvino.runtime.opset1.ops import region_yolo -from openvino.runtime.opset2.ops import reorg_yolo -from openvino.runtime.opset1.ops import relu -from openvino.runtime.opset1.ops import reshape -from openvino.runtime.opset1.ops import result -from openvino.runtime.opset1.ops import reverse_sequence -from openvino.runtime.opset3.ops import rnn_cell -from openvino.runtime.opset5.ops import rnn_sequence -from openvino.runtime.opset9.ops import roi_align -from openvino.runtime.opset2.ops import roi_pooling -from openvino.runtime.opset7.ops import roll -from openvino.runtime.opset5.ops import round -from openvino.runtime.opset3.ops import 
scatter_elements_update -from openvino.runtime.opset4.ops import scatter_nd_update -from openvino.runtime.opset3.ops import scatter_update -from openvino.runtime.opset1.ops import select -from openvino.runtime.opset1.ops import selu -from openvino.runtime.opset3.ops import shape_of -from openvino.runtime.opset3.ops import shuffle_channels -from openvino.runtime.opset1.ops import sigmoid -from openvino.runtime.opset1.ops import sign -from openvino.runtime.opset1.ops import sin -from openvino.runtime.opset1.ops import sinh -from openvino.runtime.opset8.ops import slice -from openvino.runtime.opset8.ops import softmax -from openvino.runtime.opset4.ops import softplus -from openvino.runtime.opset9.ops import softsign -from openvino.runtime.opset2.ops import space_to_batch -from openvino.runtime.opset1.ops import space_to_depth -from openvino.runtime.opset1.ops import split -from openvino.runtime.opset1.ops import sqrt -from openvino.runtime.opset1.ops import squared_difference -from openvino.runtime.opset1.ops import squeeze -from openvino.runtime.opset1.ops import strided_slice -from openvino.runtime.opset1.ops import subtract -from openvino.runtime.opset4.ops import swish -from openvino.runtime.opset1.ops import tan -from openvino.runtime.opset1.ops import tanh -from openvino.runtime.opset1.ops import tensor_iterator -from openvino.runtime.opset1.ops import tile -from openvino.runtime.opset3.ops import topk -from openvino.runtime.opset1.ops import transpose -from openvino.runtime.opset1.ops import unsqueeze -from openvino.runtime.opset1.ops import variadic_split +from openvino.opset1.ops import absolute +from openvino.opset1.ops import absolute as abs +from openvino.opset1.ops import acos +from openvino.opset4.ops import acosh +from openvino.opset8.ops import adaptive_avg_pool +from openvino.opset8.ops import adaptive_max_pool +from openvino.opset1.ops import add +from openvino.opset1.ops import asin +from openvino.opset4.ops import asinh +from openvino.opset6.ops 
import assign +from openvino.opset1.ops import atan +from openvino.opset4.ops import atanh +from openvino.opset1.ops import avg_pool +from openvino.opset5.ops import batch_norm_inference +from openvino.opset2.ops import batch_to_space +from openvino.opset1.ops import binary_convolution +from openvino.opset3.ops import broadcast +from openvino.opset3.ops import bucketize +from openvino.opset1.ops import ceiling +from openvino.opset1.ops import ceiling as ceil +from openvino.opset1.ops import clamp +from openvino.opset1.ops import concat +from openvino.opset1.ops import constant +from openvino.opset1.ops import convert +from openvino.opset1.ops import convert_like +from openvino.opset1.ops import convolution +from openvino.opset1.ops import convolution_backprop_data +from openvino.opset1.ops import cos +from openvino.opset1.ops import cosh +from openvino.opset1.ops import ctc_greedy_decoder +from openvino.opset6.ops import ctc_greedy_decoder_seq_len +from openvino.opset4.ops import ctc_loss +from openvino.opset3.ops import cum_sum +from openvino.opset3.ops import cum_sum as cumsum +from openvino.opset8.ops import deformable_convolution +from openvino.opset1.ops import deformable_psroi_pooling +from openvino.opset1.ops import depth_to_space +from openvino.opset8.ops import detection_output +from openvino.opset7.ops import dft +from openvino.opset1.ops import divide +from openvino.opset7.ops import einsum +from openvino.opset1.ops import elu +from openvino.opset3.ops import embedding_bag_offsets_sum +from openvino.opset3.ops import embedding_bag_packed_sum +from openvino.opset3.ops import embedding_segments_sum +from openvino.opset3.ops import extract_image_patches +from openvino.opset1.ops import equal +from openvino.opset1.ops import erf +from openvino.opset1.ops import exp +from openvino.opset9.ops import eye +from openvino.opset1.ops import fake_quantize +from openvino.opset1.ops import floor +from openvino.opset1.ops import floor_mod +from openvino.opset8.ops 
import gather +from openvino.opset6.ops import gather_elements +from openvino.opset8.ops import gather_nd +from openvino.opset1.ops import gather_tree +from openvino.opset7.ops import gelu +from openvino.opset9.ops import generate_proposals +from openvino.opset1.ops import greater +from openvino.opset1.ops import greater_equal +from openvino.opset9.ops import grid_sample +from openvino.opset1.ops import grn +from openvino.opset1.ops import group_convolution +from openvino.opset1.ops import group_convolution_backprop_data +from openvino.opset3.ops import gru_cell +from openvino.opset5.ops import gru_sequence +from openvino.opset1.ops import hard_sigmoid +from openvino.opset5.ops import hsigmoid +from openvino.opset4.ops import hswish +from openvino.opset7.ops import idft +from openvino.opset8.ops import if_op +from openvino.opset1.ops import interpolate +from openvino.opset9.ops import irdft +from openvino.opset8.ops import i420_to_bgr +from openvino.opset8.ops import i420_to_rgb +from openvino.opset1.ops import less +from openvino.opset1.ops import less_equal +from openvino.opset1.ops import log +from openvino.opset1.ops import logical_and +from openvino.opset1.ops import logical_not +from openvino.opset1.ops import logical_or +from openvino.opset1.ops import logical_xor +from openvino.opset5.ops import log_softmax +from openvino.opset5.ops import loop +from openvino.opset1.ops import lrn +from openvino.opset4.ops import lstm_cell +from openvino.opset5.ops import lstm_sequence +from openvino.opset1.ops import matmul +from openvino.opset8.ops import matrix_nms +from openvino.opset8.ops import max_pool +from openvino.opset1.ops import maximum +from openvino.opset1.ops import minimum +from openvino.opset4.ops import mish +from openvino.opset1.ops import mod +from openvino.opset9.ops import multiclass_nms +from openvino.opset1.ops import multiply +from openvino.opset6.ops import mvn +from openvino.opset1.ops import negative +from openvino.opset9.ops import 
non_max_suppression +from openvino.opset3.ops import non_zero +from openvino.opset1.ops import normalize_l2 +from openvino.opset1.ops import not_equal +from openvino.opset8.ops import nv12_to_bgr +from openvino.opset8.ops import nv12_to_rgb +from openvino.opset1.ops import one_hot +from openvino.opset1.ops import pad +from openvino.opset1.ops import parameter +from openvino.opset1.ops import power +from openvino.opset1.ops import prelu +from openvino.opset8.ops import prior_box +from openvino.opset1.ops import prior_box_clustered +from openvino.opset1.ops import psroi_pooling +from openvino.opset4.ops import proposal +from openvino.opset1.ops import range +from openvino.opset8.ops import random_uniform +from openvino.opset9.ops import rdft +from openvino.opset6.ops import read_value +from openvino.opset4.ops import reduce_l1 +from openvino.opset4.ops import reduce_l2 +from openvino.opset1.ops import reduce_logical_and +from openvino.opset1.ops import reduce_logical_or +from openvino.opset1.ops import reduce_max +from openvino.opset1.ops import reduce_mean +from openvino.opset1.ops import reduce_min +from openvino.opset1.ops import reduce_prod +from openvino.opset1.ops import reduce_sum +from openvino.opset1.ops import region_yolo +from openvino.opset2.ops import reorg_yolo +from openvino.opset1.ops import relu +from openvino.opset1.ops import reshape +from openvino.opset1.ops import result +from openvino.opset1.ops import reverse_sequence +from openvino.opset3.ops import rnn_cell +from openvino.opset5.ops import rnn_sequence +from openvino.opset9.ops import roi_align +from openvino.opset2.ops import roi_pooling +from openvino.opset7.ops import roll +from openvino.opset5.ops import round +from openvino.opset3.ops import scatter_elements_update +from openvino.opset4.ops import scatter_nd_update +from openvino.opset3.ops import scatter_update +from openvino.opset1.ops import select +from openvino.opset1.ops import selu +from openvino.opset3.ops import shape_of +from 
openvino.opset3.ops import shuffle_channels +from openvino.opset1.ops import sigmoid +from openvino.opset1.ops import sign +from openvino.opset1.ops import sin +from openvino.opset1.ops import sinh +from openvino.opset8.ops import slice +from openvino.opset8.ops import softmax +from openvino.opset4.ops import softplus +from openvino.opset9.ops import softsign +from openvino.opset2.ops import space_to_batch +from openvino.opset1.ops import space_to_depth +from openvino.opset1.ops import split +from openvino.opset1.ops import sqrt +from openvino.opset1.ops import squared_difference +from openvino.opset1.ops import squeeze +from openvino.opset1.ops import strided_slice +from openvino.opset1.ops import subtract +from openvino.opset4.ops import swish +from openvino.opset1.ops import tan +from openvino.opset1.ops import tanh +from openvino.opset1.ops import tensor_iterator +from openvino.opset1.ops import tile +from openvino.opset3.ops import topk +from openvino.opset1.ops import transpose +from openvino.opset1.ops import unsqueeze +from openvino.opset1.ops import variadic_split diff --git a/src/bindings/python/src/openvino/runtime/opset9/ops/__init__.py b/src/bindings/python/src/openvino/runtime/opset9/ops/__init__.py new file mode 100644 index 00000000000000..a82d349508c058 --- /dev/null +++ b/src/bindings/python/src/openvino/runtime/opset9/ops/__init__.py @@ -0,0 +1,13 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from openvino.opset9.ops import eye +from openvino.opset9.ops import generate_proposals +from openvino.opset9.ops import grid_sample +from openvino.opset9.ops import irdft +from openvino.opset9.ops import multiclass_nms +from openvino.opset9.ops import non_max_suppression +from openvino.opset9.ops import rdft +from openvino.opset9.ops import roi_align +from openvino.opset9.ops import softsign diff --git a/src/bindings/python/src/openvino/runtime/passes/__init__.py 
b/src/bindings/python/src/openvino/runtime/passes/__init__.py index 19a28c7576decd..a74f91fdcfab2e 100644 --- a/src/bindings/python/src/openvino/runtime/passes/__init__.py +++ b/src/bindings/python/src/openvino/runtime/passes/__init__.py @@ -3,8 +3,8 @@ # type: ignore # flake8: noqa -from openvino._pyopenvino.passes import ModelPass, Matcher, MatcherPass, PassBase, WrapType, Or, AnyInput, Optional -from openvino._pyopenvino.passes import ( +from openvino.passes import ModelPass, Matcher, MatcherPass, PassBase, WrapType, Or, AnyInput, Optional +from openvino.passes import ( consumers_count, has_static_dim, has_static_dims, @@ -14,6 +14,6 @@ type_matches, type_matches_any, ) -from openvino._pyopenvino.passes import Serialize, ConstantFolding, VisualizeTree, MakeStateful, LowLatency2, ConvertFP32ToFP16, Version -from openvino.runtime.passes.manager import Manager -from openvino.runtime.passes.graph_rewrite import GraphRewrite, BackwardGraphRewrite +from openvino.passes import Serialize, ConstantFolding, VisualizeTree, MakeStateful, LowLatency2, ConvertFP32ToFP16, Version +from openvino.passes.manager import Manager +from openvino.passes.graph_rewrite import GraphRewrite, BackwardGraphRewrite diff --git a/src/bindings/python/src/openvino/runtime/passes/graph_rewrite/__init__.py b/src/bindings/python/src/openvino/runtime/passes/graph_rewrite/__init__.py new file mode 100644 index 00000000000000..a9690e891ff5e8 --- /dev/null +++ b/src/bindings/python/src/openvino/runtime/passes/graph_rewrite/__init__.py @@ -0,0 +1,5 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# type: ignore +from openvino.passes.graph_rewrite import GraphRewrite, BackwardGraphRewrite diff --git a/src/bindings/python/src/openvino/runtime/passes/manager/__init__.py b/src/bindings/python/src/openvino/runtime/passes/manager/__init__.py new file mode 100644 index 00000000000000..1a2674dd03c2b1 --- /dev/null +++ 
b/src/bindings/python/src/openvino/runtime/passes/manager/__init__.py @@ -0,0 +1,5 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# type: ignore +from openvino.passes.manager import Manager diff --git a/src/bindings/python/tests/test_graph/test_manager.py b/src/bindings/python/tests/test_graph/test_manager.py index ff72ef43158d6e..5101414228c06e 100644 --- a/src/bindings/python/tests/test_graph/test_manager.py +++ b/src/bindings/python/tests/test_graph/test_manager.py @@ -9,7 +9,7 @@ import openvino.runtime.opset10 as ops from openvino import Core, Model -from openvino.runtime.passes import Manager, Serialize, ConstantFolding, Version +from openvino.passes import Manager, Serialize, ConstantFolding, Version from tests.test_graph.util import count_ops_of_type from tests.utils.helpers import create_filenames_for_ir, compare_models @@ -48,6 +48,23 @@ def test_constant_folding(): assert np.allclose(values_out, values_expected) +def test_runtime_passes_manager(): + import openvino.runtime.passes as rt + node_constant = ops.constant(np.array([[0.0, 0.1, -0.1], [-2.5, 2.5, 3.0]], dtype=np.float32)) + node_ceil = ops.ceiling(node_constant) + model = Model(node_ceil, [], "TestModel") + + assert count_ops_of_type(model, node_ceil) == 1 + assert count_ops_of_type(model, node_constant) == 1 + + pass_manager = rt.Manager() + pass_manager.register_pass(rt.ConstantFolding()) + pass_manager.run_passes(model) + + assert count_ops_of_type(model, node_ceil) == 0 + assert count_ops_of_type(model, node_constant) == 1 + + # request - https://docs.pytest.org/en/7.1.x/reference/reference.html#request @pytest.fixture def prepare_ir_paths(request, tmp_path): diff --git a/src/bindings/python/tests/test_transformations/test_graph_rewrite.py b/src/bindings/python/tests/test_transformations/test_graph_rewrite.py index 4821dad33dff0a..5f5c100597adf2 100644 --- a/src/bindings/python/tests/test_transformations/test_graph_rewrite.py 
+++ b/src/bindings/python/tests/test_transformations/test_graph_rewrite.py @@ -2,7 +2,7 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 from openvino.runtime import opset8 -from openvino.runtime.passes import Manager, GraphRewrite, MatcherPass, WrapType, Matcher +from openvino.passes import Manager, GraphRewrite, MatcherPass, WrapType, Matcher from tests.test_transformations.utils.utils import count_ops, get_relu_model, PatternReplacement @@ -19,6 +19,19 @@ def test_graph_rewrite(): assert count_ops(model, "Relu") == [2] +def test_runtime_graph_rewrite(): + import openvino.runtime.passes as rt + model = get_relu_model() + + manager = rt.Manager() + # check that register pass returns pass instance + anchor = manager.register_pass(rt.GraphRewrite()) + anchor.add_matcher(PatternReplacement()) + manager.run_passes(model) + + assert count_ops(model, "Relu") == [2] + + def test_register_new_node(): class InsertExp(MatcherPass): def __init__(self): diff --git a/src/bindings/python/tests/test_transformations/test_manager.py b/src/bindings/python/tests/test_transformations/test_manager.py index e78c62d8c1a5c4..e8f113f30b381c 100644 --- a/src/bindings/python/tests/test_transformations/test_manager.py +++ b/src/bindings/python/tests/test_transformations/test_manager.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -from openvino.runtime.passes import Manager, GraphRewrite, BackwardGraphRewrite, Serialize +from openvino.passes import Manager, GraphRewrite, BackwardGraphRewrite, Serialize from tests.test_transformations.utils.utils import MyModelPass, PatternReplacement, expect_exception diff --git a/src/bindings/python/tests/test_transformations/test_matcher_pass.py b/src/bindings/python/tests/test_transformations/test_matcher_pass.py index c32483be316658..8127e4b7612d56 100644 --- a/src/bindings/python/tests/test_transformations/test_matcher_pass.py +++ 
b/src/bindings/python/tests/test_transformations/test_matcher_pass.py @@ -2,7 +2,7 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 from openvino.runtime import opset8 -from openvino.runtime.passes import Manager, Matcher, MatcherPass, WrapType +from openvino.passes import Manager, Matcher, MatcherPass, WrapType from openvino.runtime.utils import replace_node from tests.test_transformations.utils.utils import count_ops, get_relu_model, PatternReplacement diff --git a/src/bindings/python/tests/test_transformations/test_model_pass.py b/src/bindings/python/tests/test_transformations/test_model_pass.py index 5df3d0a9024dc2..efc797535d8bb8 100644 --- a/src/bindings/python/tests/test_transformations/test_model_pass.py +++ b/src/bindings/python/tests/test_transformations/test_model_pass.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -from openvino.runtime.passes import Manager +from openvino.passes import Manager from tests.test_transformations.utils.utils import get_relu_model, MyModelPass diff --git a/src/bindings/python/tests/test_transformations/test_pattern_ops.py b/src/bindings/python/tests/test_transformations/test_pattern_ops.py index 24b28061582c68..c445c281e47171 100644 --- a/src/bindings/python/tests/test_transformations/test_pattern_ops.py +++ b/src/bindings/python/tests/test_transformations/test_pattern_ops.py @@ -6,8 +6,8 @@ from openvino import PartialShape from openvino.runtime import opset13 as ops -from openvino.runtime.passes import Matcher, WrapType, Or, AnyInput, Optional -from openvino.runtime.passes import ( +from openvino.passes import Matcher, WrapType, Or, AnyInput, Optional +from openvino.passes import ( consumers_count, has_static_dim, has_static_dims, diff --git a/src/bindings/python/tests/test_transformations/test_public_transformations.py b/src/bindings/python/tests/test_transformations/test_public_transformations.py index 
a10fea786b9770..eac356cd1341f5 100644 --- a/src/bindings/python/tests/test_transformations/test_public_transformations.py +++ b/src/bindings/python/tests/test_transformations/test_public_transformations.py @@ -7,7 +7,7 @@ from openvino import Model, PartialShape, Shape, Core from openvino.runtime import opset13 as ops -from openvino.runtime.passes import ( +from openvino.passes import ( Manager, ConstantFolding, MakeStateful, diff --git a/src/bindings/python/tests/test_transformations/utils/utils.py b/src/bindings/python/tests/test_transformations/utils/utils.py index e0239ce05fdc9d..b5f09a68ff1511 100644 --- a/src/bindings/python/tests/test_transformations/utils/utils.py +++ b/src/bindings/python/tests/test_transformations/utils/utils.py @@ -4,7 +4,7 @@ from openvino import Model, PartialShape from openvino.runtime import opset13 as ops -from openvino.runtime.passes import ModelPass, Matcher, MatcherPass, WrapType +from openvino.passes import ModelPass, Matcher, MatcherPass, WrapType def get_relu_model(): diff --git a/src/common/snippets/include/snippets/lowered/pass/solve_buffer_memory.hpp b/src/common/snippets/include/snippets/lowered/pass/solve_buffer_memory.hpp index 71b5f4ba6c6f96..4d3c9f95350f4b 100644 --- a/src/common/snippets/include/snippets/lowered/pass/solve_buffer_memory.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/solve_buffer_memory.hpp @@ -34,6 +34,10 @@ class SolveBufferMemory : public Pass { */ bool run(lowered::LinearIR& linear_ir) override; + // For better performance, data should be aligned with the cache line size. + // The majority of CPUs have cache line size `64` bytes.
+ constexpr static size_t byte_alignment = 64; + private: using Buffers = std::vector; /** @@ -64,8 +68,6 @@ class SolveBufferMemory : public Pass { void set_dynamic_buffer_offset(const Buffers& dynamic_buffer_expressions); size_t& m_static_buffer_scratchpad_size; - - constexpr static size_t m_alignment = 32; // 32 bytes for data alignment in allocated memory }; } // namespace pass diff --git a/src/common/snippets/src/lowered/pass/solve_buffer_memory.cpp b/src/common/snippets/src/lowered/pass/solve_buffer_memory.cpp index ca85cefd369099..ec7ab6c95eb89a 100644 --- a/src/common/snippets/src/lowered/pass/solve_buffer_memory.cpp +++ b/src/common/snippets/src/lowered/pass/solve_buffer_memory.cpp @@ -102,9 +102,8 @@ std::vector SolveBufferMemory::init_boxes(const Buffers& boxes.reserve(map_boxes.size()); for (auto& p : map_boxes) { auto& box = p.second; - // We use data alignment to put data in the line cache - // TODO [143395] : Please check if alignment is really needed here - box.size = utils::div_up(box.size, m_alignment); + // Align with cache line size. The experiments show that it affects performance. 
+ box.size = utils::div_up(box.size, byte_alignment); boxes.push_back(box); } @@ -116,12 +115,12 @@ void SolveBufferMemory::solve_static_buffer_memory(const Buffers& static_buffer_ const auto boxes = init_boxes(static_buffer_expressions, linear_ir); ov::MemorySolver memSolver(boxes); - m_static_buffer_scratchpad_size = static_cast(memSolver.solve()) * m_alignment; // alignment in byte + m_static_buffer_scratchpad_size = static_cast(memSolver.solve()) * byte_alignment; // alignment in byte // Set offsets for Buffers for (const auto& buffer_expr : static_buffer_expressions) { const auto offset = static_cast(memSolver.get_offset(static_cast(buffer_expr->get_cluster_id()))); - buffer_expr->set_offset(offset * m_alignment); // alignment in byte + buffer_expr->set_offset(offset * byte_alignment); // alignment in byte } } diff --git a/src/common/snippets/src/runtime_configurator.cpp b/src/common/snippets/src/runtime_configurator.cpp index 96d13074d042ba..06beb8db94ae3d 100644 --- a/src/common/snippets/src/runtime_configurator.cpp +++ b/src/common/snippets/src/runtime_configurator.cpp @@ -8,6 +8,8 @@ #include "snippets/lowered/pass/init_loops.hpp" #include "snippets/lowered/pass/insert_specific_iterations.hpp" #include "snippets/lowered/pass/mha_parallel_wa_optimizer.hpp" +#include "snippets/lowered/pass/solve_buffer_memory.hpp" +#include "snippets/pass/split_dimension_m.hpp" #include "snippets/snippets_isa.hpp" #include "snippets/utils/loop_utils.hpp" #include "snippets/utils/utils.hpp" @@ -228,7 +230,8 @@ void RuntimeConfigurator::update_loop_info(const lowered::LinearIRCPtr& linear_i void RuntimeConfigurator::update_buffer_scratchpad_size(const lowered::LinearIRCPtr& linear_ir) const { const auto& loop_manager = linear_ir->get_loop_manager(); - m_config->buffer_scratchpad_size = linear_ir->get_static_buffer_scratchpad_size(); + // Align initial buffer scratchpad size with cache line size + m_config->buffer_scratchpad_size = 
utils::rnd_up(linear_ir->get_static_buffer_scratchpad_size(), lowered::pass::SolveBufferMemory::byte_alignment); auto is_not_executed = [&loop_manager](const lowered::ExpressionPtr& buffer_expr) { const auto& loop_ids = buffer_expr->get_loop_ids(); @@ -254,6 +257,9 @@ void RuntimeConfigurator::update_buffer_scratchpad_size(const lowered::LinearIRC additional_size = std::max(allocation_size * buffer_expr->get_node()->get_element_type().size(), additional_size); } + // Align with cache line size. The experiments show that it affects performance. + additional_size = utils::rnd_up(additional_size, lowered::pass::SolveBufferMemory::byte_alignment); + cluster_offset = m_config->buffer_scratchpad_size; OPENVINO_ASSERT(!utils::is_dynamic_value(cluster_offset), "Offset of the cluster must be defined!"); m_config->buffer_scratchpad_size += additional_size; diff --git a/src/common/transformations/include/ov_ops/fully_connected.hpp b/src/common/transformations/include/ov_ops/fully_connected.hpp new file mode 100644 index 00000000000000..6f33b5963ffaf8 --- /dev/null +++ b/src/common/transformations/include/ov_ops/fully_connected.hpp @@ -0,0 +1,46 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/core/node.hpp" +#include "openvino/op/op.hpp" +#include "transformations_visibility.hpp" + +namespace ov { +namespace op { +namespace internal { + +class TRANSFORMATIONS_API FullyConnected : public ov::op::Op { +public: + OPENVINO_OP("FullyConnected", "ie_internal_opset"); + + FullyConnected() = default; + + FullyConnected(const ov::Output& A, + const ov::Output& B, + const ov::Output& bias, + const ov::element::Type output_type = ov::element::undefined); + + FullyConnected(const ov::Output& A, + const ov::Output& B, + const ov::element::Type output_type = ov::element::undefined); + + bool visit_attributes(ov::AttributeVisitor& visitor) override; + + ov::element::Type get_output_type() const { + return
m_output_type; + } + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + + void validate_and_infer_types() override; + +protected: + ov::element::Type m_output_type; +}; + +} // namespace internal +} // namespace op +} // namespace ov diff --git a/src/common/transformations/include/ov_ops/fully_connected_compressed.hpp b/src/common/transformations/include/ov_ops/fully_connected_compressed.hpp new file mode 100644 index 00000000000000..d363a339406070 --- /dev/null +++ b/src/common/transformations/include/ov_ops/fully_connected_compressed.hpp @@ -0,0 +1,41 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/core/node.hpp" +#include "openvino/op/op.hpp" +#include "ov_ops/fully_connected.hpp" + +namespace ov { +namespace op { +namespace internal { + +class TRANSFORMATIONS_API FullyConnectedCompressed : public FullyConnected { +public: + OPENVINO_OP("FullyConnectedCompressed", "ie_internal_opset", FullyConnected); + + FullyConnectedCompressed() = default; + + FullyConnectedCompressed(const ov::Output& X, + const ov::Output& W, + const ov::Output& bias, + const ov::Output& weight_scales, + const ov::Output& weight_zero_points, + const ov::element::Type output_type = ov::element::undefined); + + FullyConnectedCompressed(const ov::Output& X, + const ov::Output& W, + const ov::Output& bias, + const ov::Output& weight_scales, + const ov::element::Type output_type = ov::element::undefined); + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + + void validate_and_infer_types() override; +}; + +} // namespace internal +} // namespace op +} // namespace ov diff --git a/src/common/transformations/include/ov_ops/fully_connected_quantized.hpp b/src/common/transformations/include/ov_ops/fully_connected_quantized.hpp new file mode 100644 index 00000000000000..6eceed0abdef78 --- /dev/null +++ 
b/src/common/transformations/include/ov_ops/fully_connected_quantized.hpp @@ -0,0 +1,39 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/core/node.hpp" +#include "openvino/op/op.hpp" +#include "ov_ops/fully_connected.hpp" + +namespace ov { +namespace op { +namespace internal { + +class TRANSFORMATIONS_API FullyConnectedQuantized : public FullyConnected { +public: + OPENVINO_OP("FullyConnectedQuantized", "ie_internal_opset", FullyConnected); + + FullyConnectedQuantized() = default; + + FullyConnectedQuantized(const ov::Output& X, + const ov::Output& W, + const ov::Output& bias, + const ov::Output& weight_scales, + const ov::Output& weight_zero_points, + const ov::Output& input_scales, + const ov::Output& input_zero_points, + const ov::Output& output_scales, + const ov::Output& output_zero_points, + const ov::element::Type output_type = ov::element::undefined); + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; +}; + +} // namespace internal +} // namespace op +} // namespace ov diff --git a/src/common/transformations/include/ov_ops/fully_connected_quantized_legacy.hpp b/src/common/transformations/include/ov_ops/fully_connected_quantized_legacy.hpp new file mode 100644 index 00000000000000..2c68ec4dc365f9 --- /dev/null +++ b/src/common/transformations/include/ov_ops/fully_connected_quantized_legacy.hpp @@ -0,0 +1,41 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/core/node.hpp" +#include "openvino/op/op.hpp" +#include "ov_ops/fully_connected.hpp" + +namespace ov { +namespace op { +namespace internal { + +class TRANSFORMATIONS_API FullyConnectedQuantizedLegacy : public FullyConnected { +public: + OPENVINO_OP("FullyConnectedQuantizedLegacy", "ie_internal_opset", FullyConnected); + + FullyConnectedQuantizedLegacy() = 
default; + + FullyConnectedQuantizedLegacy(const ov::Output& X, + const ov::Output& W, + const ov::Output& bias, + const ov::Output& deq_scales, + const ov::Output& deq_zero_points, + const ov::element::Type output_type = ov::element::undefined); + + FullyConnectedQuantizedLegacy(const ov::Output& X, + const ov::Output& W, + const ov::Output& bias, + const ov::Output& deq_scales, + const ov::element::Type output_type = ov::element::undefined); + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + + void validate_and_infer_types() override; +}; + +} // namespace internal +} // namespace op +} // namespace ov diff --git a/src/common/transformations/include/ov_ops/glu.hpp b/src/common/transformations/include/ov_ops/glu.hpp index 760641978b574d..add8c3a0582525 100644 --- a/src/common/transformations/include/ov_ops/glu.hpp +++ b/src/common/transformations/include/ov_ops/glu.hpp @@ -75,10 +75,6 @@ class TRANSFORMATIONS_API GLU : public ov::op::Op { ov::element::Type m_output_type{}; }; -// TODO 157615: Move to shape_inference -TRANSFORMATIONS_API std::vector shape_infer(const GLU* op, - std::vector input_shapes); - } // namespace internal } // namespace op } // namespace ov diff --git a/src/common/transformations/include/transformations/op_conversions/convert_fc_to_compressed.hpp b/src/common/transformations/include/transformations/op_conversions/convert_fc_to_compressed.hpp new file mode 100644 index 00000000000000..1b6fcfb2bb3684 --- /dev/null +++ b/src/common/transformations/include/transformations/op_conversions/convert_fc_to_compressed.hpp @@ -0,0 +1,29 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/matcher_pass.hpp" +#include "ov_ops/fully_connected.hpp" +#include "transformations_visibility.hpp" + +namespace ov { +namespace pass { + +class TRANSFORMATIONS_API ConvertFullyConnectedToFullyConnectedCompressed; + +} // namespace pass +} // 
namespace ov + +class ov::pass::ConvertFullyConnectedToFullyConnectedCompressed : public ov::pass::MatcherPass { +public: + using SupportsPredicate = + std::function&, size_t, size_t, size_t)>; + + OPENVINO_RTTI("ConvertFullyConnectedToFullyConnectedCompressed", "0"); + ConvertFullyConnectedToFullyConnectedCompressed(const std::vector& supported_activation_types, + const std::vector& supported_weights_types, + SupportsPredicate supports_config = nullptr, + bool convert_u4zp_to_u8 = false); +}; diff --git a/src/common/transformations/include/transformations/op_conversions/convert_fc_to_quantized_legacy.hpp b/src/common/transformations/include/transformations/op_conversions/convert_fc_to_quantized_legacy.hpp new file mode 100644 index 00000000000000..88990f92cb573c --- /dev/null +++ b/src/common/transformations/include/transformations/op_conversions/convert_fc_to_quantized_legacy.hpp @@ -0,0 +1,22 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/matcher_pass.hpp" +#include "transformations_visibility.hpp" + +namespace ov { +namespace pass { + +class TRANSFORMATIONS_API ConvertFCToFCQuantizedLegacy; + +} // namespace pass +} // namespace ov + +class ov::pass::ConvertFCToFCQuantizedLegacy : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("ConvertFullyConnectedToFullyConnectedQuantized", "0"); + ConvertFCToFCQuantizedLegacy(); +}; diff --git a/src/common/transformations/src/ov_ops/fully_connected.cpp b/src/common/transformations/src/ov_ops/fully_connected.cpp new file mode 100644 index 00000000000000..3fa609362b999c --- /dev/null +++ b/src/common/transformations/src/ov_ops/fully_connected.cpp @@ -0,0 +1,62 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ov_ops/fully_connected.hpp" + +#include + +#include "matmul_shape_inference.hpp" + +namespace ov { +namespace op { +namespace internal { + 
+FullyConnected::FullyConnected(const ov::Output& A, + const ov::Output& B, + const ov::Output& bias, + const ov::element::Type output_type) + : Op({A, B, bias}), + m_output_type(output_type) { + validate_and_infer_types(); +} + +FullyConnected::FullyConnected(const ov::Output& A, + const ov::Output& B, + const ov::element::Type output_type) + : FullyConnected(A, B, std::make_shared(element::undefined, Shape{0}), output_type) {} + +bool FullyConnected::visit_attributes(ov::AttributeVisitor& visitor) { + visitor.on_attribute("output_type", m_output_type); + return true; +} + +std::shared_ptr FullyConnected::clone_with_new_inputs(const ov::OutputVector& new_args) const { + check_new_args_count(this, new_args); + + return std::make_shared(new_args.at(0), new_args.at(1), new_args.at(2), m_output_type); +} + +void FullyConnected::validate_and_infer_types() { + const auto input_size = get_input_size(); + NODE_VALIDATION_CHECK(this, + input_size >= 3, + "Number of inputs is incorrect. Current value is: ", + input_size, + ", expected at least 3."); + + ov::op::v0::MatMul op; + op.set_transpose_a(false); + op.set_transpose_b(true); + + auto out_shapes = + ov::op::v0::shape_infer(&op, + std::vector{get_input_partial_shape(0), get_input_partial_shape(1)}); + + auto output_type = m_output_type == ov::element::undefined ? 
get_input_element_type(0) : m_output_type; + set_output_type(0, output_type, out_shapes[0]); +} + +} // namespace internal +} // namespace op +} // namespace ov diff --git a/src/common/transformations/src/ov_ops/fully_connected_compressed.cpp b/src/common/transformations/src/ov_ops/fully_connected_compressed.cpp new file mode 100644 index 00000000000000..e0bb13042ea6ff --- /dev/null +++ b/src/common/transformations/src/ov_ops/fully_connected_compressed.cpp @@ -0,0 +1,63 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ov_ops/fully_connected_compressed.hpp" + +#include + +#include "openvino/core/type/element_type.hpp" +#include "openvino/op/constant.hpp" +#include "ov_ops/fully_connected.hpp" + +namespace ov { +namespace op { +namespace internal { + +FullyConnectedCompressed::FullyConnectedCompressed(const ov::Output& X, + const ov::Output& W, + const ov::Output& bias, + const ov::Output& weight_scales, + const ov::Output& weight_zero_points, + const ov::element::Type output_type) + : FullyConnected(X, W, bias, output_type) { + set_argument(3, weight_scales); + set_argument(4, weight_zero_points); + validate_and_infer_types(); +} + +FullyConnectedCompressed::FullyConnectedCompressed(const ov::Output& X, + const ov::Output& W, + const ov::Output& bias, + const ov::Output& weight_scales, + const ov::element::Type output_type) + : FullyConnectedCompressed(X, + W, + bias, + weight_scales, + std::make_shared(element::undefined, Shape{0}), + output_type) {} + +std::shared_ptr FullyConnectedCompressed::clone_with_new_inputs(const ov::OutputVector& new_args) const { + check_new_args_count(this, new_args); + + return std::make_shared(new_args.at(0), + new_args.at(1), + new_args.at(2), + new_args.at(3), + new_args.at(4), + m_output_type); +} + +// @todo finalize validate_and_infer_types +void FullyConnectedCompressed::validate_and_infer_types() { + const auto input_size = get_input_size(); + + 
NODE_VALIDATION_CHECK(this, input_size == 5, "Number of inputs is incorrect. Current value is: ", input_size); + + FullyConnected::validate_and_infer_types(); +} + +} // namespace internal +} // namespace op +} // namespace ov diff --git a/src/common/transformations/src/ov_ops/fully_connected_quantized.cpp b/src/common/transformations/src/ov_ops/fully_connected_quantized.cpp new file mode 100644 index 00000000000000..3f06e14834f7d1 --- /dev/null +++ b/src/common/transformations/src/ov_ops/fully_connected_quantized.cpp @@ -0,0 +1,59 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ov_ops/fully_connected_quantized.hpp" + +#include "openvino/core/type/element_type.hpp" +#include "ov_ops/fully_connected.hpp" + +namespace ov { +namespace op { +namespace internal { + +FullyConnectedQuantized::FullyConnectedQuantized(const ov::Output& X, + const ov::Output& W, + const ov::Output& bias, + const ov::Output& weight_scales, + const ov::Output& weight_zero_points, + const ov::Output& input_scales, + const ov::Output& input_zero_points, + const ov::Output& output_scales, + const ov::Output& output_zero_points, + const ov::element::Type output_type) + : FullyConnected(X, W, bias, output_type) { + set_argument(3, weight_scales); + set_argument(4, weight_zero_points); + set_argument(5, input_scales); + set_argument(6, input_zero_points); + set_argument(7, output_scales); + set_argument(8, output_zero_points); + validate_and_infer_types(); +} + +std::shared_ptr FullyConnectedQuantized::clone_with_new_inputs(const ov::OutputVector& new_args) const { + check_new_args_count(this, new_args); + + return std::make_shared(new_args.at(0), + new_args.at(1), + new_args.at(2), + new_args.at(3), + new_args.at(4), + new_args.at(5), + new_args.at(6), + new_args.at(7), + new_args.at(8), + m_output_type); +} + +// @todo finalize validate_and_infer_types +void FullyConnectedQuantized::validate_and_infer_types() { + const auto input_size = 
get_input_size(); + NODE_VALIDATION_CHECK(this, input_size == 9, "Number of inputs is incorrect. Current value is: ", input_size); + + FullyConnected::validate_and_infer_types(); +} + +} // namespace internal +} // namespace op +} // namespace ov diff --git a/src/common/transformations/src/ov_ops/fully_connected_quantized_legacy.cpp b/src/common/transformations/src/ov_ops/fully_connected_quantized_legacy.cpp new file mode 100644 index 00000000000000..42df0980086199 --- /dev/null +++ b/src/common/transformations/src/ov_ops/fully_connected_quantized_legacy.cpp @@ -0,0 +1,71 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ov_ops/fully_connected_quantized_legacy.hpp" + +#include + +#include "matmul_shape_inference.hpp" +#include "openvino/core/type/element_type.hpp" + +namespace ov { +namespace op { +namespace internal { + +FullyConnectedQuantizedLegacy::FullyConnectedQuantizedLegacy(const ov::Output& X, + const ov::Output& W, + const ov::Output& bias, + const ov::Output& deq_scales, + const ov::Output& deq_zero_points, + const ov::element::Type output_type) + : FullyConnected(X, W, bias, output_type) { + set_argument(3, deq_scales); + set_argument(4, deq_zero_points); + validate_and_infer_types(); +} + +FullyConnectedQuantizedLegacy::FullyConnectedQuantizedLegacy(const ov::Output& X, + const ov::Output& W, + const ov::Output& bias, + const ov::Output& deq_scales, + const ov::element::Type output_type) + : FullyConnectedQuantizedLegacy(X, + W, + bias, + deq_scales, + std::make_shared(element::undefined, Shape{0}), + output_type) {} + +std::shared_ptr FullyConnectedQuantizedLegacy::clone_with_new_inputs(const ov::OutputVector& new_args) const { + check_new_args_count(this, new_args); + + return std::make_shared(new_args.at(0), + new_args.at(1), + new_args.at(2), + new_args.at(3), + new_args.at(4), + m_output_type); +} + +// @todo finalize validate_and_infer_types +void 
FullyConnectedQuantizedLegacy::validate_and_infer_types() { + const auto input_size = get_input_size(); + + NODE_VALIDATION_CHECK(this, input_size == 5, "Number of inputs is incorrect. Current value is: ", input_size); + + ov::op::v0::MatMul op; + op.set_transpose_a(false); + op.set_transpose_b(true); + + auto out_shapes = + ov::op::v0::shape_infer(&op, + std::vector{get_input_partial_shape(0), get_input_partial_shape(1)}); + + auto output_type = m_output_type == ov::element::undefined ? get_input_element_type(0) : m_output_type; + set_output_type(0, output_type, out_shapes[0]); +} + +} // namespace internal +} // namespace op +} // namespace ov diff --git a/src/common/transformations/src/ov_ops/glu.cpp b/src/common/transformations/src/ov_ops/glu.cpp index bc3dfb89ab8b9b..9b5fb780d36bb8 100644 --- a/src/common/transformations/src/ov_ops/glu.cpp +++ b/src/common/transformations/src/ov_ops/glu.cpp @@ -4,10 +4,9 @@ #include "ov_ops/glu.hpp" +#include "glu_shape_inference.hpp" #include "openvino/core/partial_shape.hpp" #include "openvino/core/validation_util.hpp" -#include "openvino/op/variadic_split.hpp" -#include "variadic_split_shape_inference.hpp" namespace ov { namespace op { @@ -38,11 +37,9 @@ bool GLU::visit_attributes(ov::AttributeVisitor& visitor) { void GLU::validate_and_infer_types() { auto output_type = m_output_type == ov::element::undefined ? 
get_input_element_type(0) : m_output_type; - std::vector input_shapes = {get_input_partial_shape(0), - ov::PartialShape(ov::Shape{}), - ov::PartialShape(ov::Shape{2})}; - - set_output_type(0, output_type, shape_infer(this, input_shapes)[0]); + const auto input_shapes = ov::util::get_node_input_partial_shapes(*this); + const auto output_shapes = shape_infer(this, input_shapes); + set_output_type(0, output_type, output_shapes[0]); } std::shared_ptr GLU::clone_with_new_inputs(const ov::OutputVector& new_args) const { @@ -54,21 +51,6 @@ std::shared_ptr GLU::clone_with_new_inputs(const ov::OutputVector& new_arg m_split_to_glu_idx, m_output_type); } - -std::vector shape_infer(const GLU* op, std::vector input_shapes) { - ov::op::v1::VariadicSplit variadic_split; - std::vector axis = {op->get_axis()}; - std::vector split_lengths = {op->get_split_lengths(), -1}; - - std::unordered_map const_data; - const_data.emplace(1, ov::Tensor(ov::element::i64, ov::Shape{}, static_cast(axis.data()))); - const_data.emplace( - 2, - ov::Tensor(ov::element::i64, ov::Shape{split_lengths.size()}, static_cast(split_lengths.data()))); - - return ov::op::v1::shape_infer(&variadic_split, input_shapes, ov::make_tensor_accessor(const_data)); -} - } // namespace internal } // namespace op } // namespace ov diff --git a/src/common/transformations/src/transformations/common_optimizations/nop_elimination.cpp b/src/common/transformations/src/transformations/common_optimizations/nop_elimination.cpp index 8b26d74dc2aac6..3883e94b74d33c 100644 --- a/src/common/transformations/src/transformations/common_optimizations/nop_elimination.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/nop_elimination.cpp @@ -132,6 +132,23 @@ static bool eliminate_nop(const shared_ptr& node) { return false; } +// Check if first dim is dynamic, other dims are static +static bool only_first_dim_dynamic(const PartialShape& pshape) { + if (pshape.rank().is_static() && pshape.size() > 0) { + if 
(pshape[0].is_dynamic()) { + for (size_t i = 1; i < pshape.size(); ++i) { + if (pshape[i].is_dynamic()) { + return false; + } + } + + return true; + } + } + + return false; +} + static bool eliminate_reshape_v1(const shared_ptr& node) { auto input = node->input_value(0); @@ -139,14 +156,17 @@ static bool eliminate_reshape_v1(const shared_ptr& node) { if (input.get_partial_shape().same_scheme(node->get_output_partial_shape(0))) return replace_output_update_name(node->output(0), input); } - // check if reshape is not identity op if (input.get_partial_shape().is_dynamic() || node->get_output_partial_shape(0).is_dynamic()) { - OPENVINO_DEBUG(node, " has dynamic shapes."); - return false; + if (!only_first_dim_dynamic(input.get_partial_shape()) || + !only_first_dim_dynamic(node->get_output_partial_shape(0))) { + OPENVINO_DEBUG(node, " has dynamic shapes with not only 0th dimension dynamic."); + return false; + } } + // remove identity op - if (input.get_shape() == node->get_output_shape(0)) { + if (input.get_partial_shape() == node->get_output_partial_shape(0)) { return replace_output_update_name(node->output(0), input); } // eliminate redundant reshape, squeeze, or unsqueeze @@ -156,15 +176,19 @@ static bool eliminate_reshape_v1(const shared_ptr& node) { if (input_node->get_output_target_inputs(0).size() != 1) return false; - auto shape = node->get_output_shape(0); + auto shape = node->get_output_partial_shape(0); // remove interchangeable nodes - if (input_node->get_input_partial_shape(0).is_static() && input_node->get_input_shape(0) == shape) { + if (input_node->get_input_partial_shape(0).is_static() && + input_node->get_input_partial_shape(0) == node->get_output_partial_shape(0)) { return replace_output_update_name(node->output(0), input_node->input_value(0)); } else { vector vi; - vi.assign(shape.begin(), shape.end()); - auto pat = ov::op::v0::Constant::create(element::i64, Shape{vi.size()}, vi); + vi.reserve(shape.size()); + for (const auto& dim : shape) { + 
vi.push_back(dim.is_dynamic() ? -1 : dim.get_length()); + } + auto pat = ov::op::v0::Constant::create(element::i64, Shape{shape.size()}, vi); auto new_reshape = make_shared(input.get_node()->input_value(0), pat, false); new_reshape->set_friendly_name(node->get_friendly_name()); copy_runtime_info({input_node, node}, new_reshape); diff --git a/src/common/transformations/src/transformations/op_conversions/convert_fc_to_compressed.cpp b/src/common/transformations/src/transformations/op_conversions/convert_fc_to_compressed.cpp new file mode 100644 index 00000000000000..87c3b669d98c6d --- /dev/null +++ b/src/common/transformations/src/transformations/op_conversions/convert_fc_to_compressed.cpp @@ -0,0 +1,181 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "transformations/op_conversions/convert_fc_to_compressed.hpp" + +#include +#include + +#include "openvino/core/rt_info.hpp" +#include "openvino/core/type/element_type.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/convert.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/op/subtract.hpp" +#include "openvino/op/transpose.hpp" +#include "openvino/pass/pattern/op/or.hpp" +#include "openvino/pass/pattern/op/pattern.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "ov_ops/fully_connected.hpp" +#include "ov_ops/fully_connected_compressed.hpp" +#include "transformations/utils/utils.hpp" + +ov::pass::ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyConnectedCompressed( + const std::vector& supported_activation_types, + const std::vector& supported_weights_types, + SupportsPredicate supports_config, + bool convert_u4zp_to_u8) { + using namespace ov::pass::pattern; + + auto reshape_3d_to_2d = [](const ov::Output& output) { + auto in_ps = output.get_node()->get_input_partial_shape(0); + auto out_ps = output.get_node()->get_output_partial_shape(0); + return 
in_ps.rank().is_static() && out_ps.rank().is_static() && in_ps.size() == 3 && out_ps.size() == 2; + }; + + auto activation_m = any_input(ov::pass::pattern::type_matches_any(supported_activation_types)); + auto weights_m = wrap_type(ov::pass::pattern::type_matches_any(supported_weights_types)); + auto convert_m = wrap_type({weights_m}); + + auto sub_const_m = wrap_type(); + auto sub_convert_const_m = wrap_type({sub_const_m}); + auto sub_with_convert_m = wrap_type({convert_m, sub_convert_const_m}); + auto sub_no_convert_m = wrap_type({convert_m, sub_const_m}); + auto subtract_m = std::make_shared(OutputVector{sub_with_convert_m, sub_no_convert_m}); + + auto mul_const_m = wrap_type(); + auto mul_convert_const_m = wrap_type({mul_const_m}); + auto mul_scale_m = std::make_shared(OutputVector{mul_const_m, mul_convert_const_m}); + + auto mul_with_sub_m = wrap_type({subtract_m, mul_scale_m}); + auto mul_no_sub_m = wrap_type({convert_m, mul_scale_m}); + auto mul_m = std::make_shared(OutputVector{mul_with_sub_m, mul_no_sub_m}); + + auto reshape_const_m = wrap_type(); + auto reshape_m = wrap_type({mul_m, reshape_const_m}, reshape_3d_to_2d); + + auto transpose_input = std::make_shared(OutputVector{reshape_m, mul_m}); + auto transpose_const_m = wrap_type(); + auto transpose_m = wrap_type({transpose_input, transpose_const_m}); + + auto bias_m = any_input(); + auto weights_input_m = std::make_shared(ov::OutputVector{reshape_m, transpose_m, mul_m}); + auto fully_connected_m = wrap_type({activation_m, weights_input_m, bias_m}); + + ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + OPENVINO_ASSERT(pattern_map.count(fully_connected_m)); + OPENVINO_ASSERT(pattern_map.count(mul_const_m)); + OPENVINO_ASSERT(pattern_map.count(weights_m)); + OPENVINO_ASSERT(pattern_map.count(bias_m)); + OPENVINO_ASSERT(pattern_map.count(convert_m)); + auto fc = std::dynamic_pointer_cast( + 
pattern_map.at(fully_connected_m).get_node_shared_ptr()); + if (!fc || transformation_callback(fc)) { + return false; + } + + bool has_transpose = pattern_map.count(transpose_m); + auto scale_shape = pattern_map.at(mul_const_m).get_shape(); + bool grouped = std::count_if(scale_shape.begin(), scale_shape.end(), [](size_t d) { + return d > 1; + }) > 1; + + auto weights_shape = fc->get_input_shape(1); + const size_t IC = *(weights_shape.rbegin()); + const size_t OC = *(weights_shape.rbegin() + 1); + + const size_t G = grouped ? (has_transpose ? *(scale_shape.rbegin() + 2) : *(scale_shape.rbegin() + 1)) : 1; + + if (supports_config && !supports_config(fc, IC, OC, G)) + return false; + + auto reshape_const_to_2d = [has_transpose, grouped](std::shared_ptr node) { + auto constant = std::dynamic_pointer_cast(node); + OPENVINO_ASSERT(constant != nullptr); + ov::Shape current_shape = constant->get_shape(); + if (current_shape.size() <= 2) + return constant; + + OPENVINO_ASSERT(current_shape.size() == 3); + + auto new_shape = (has_transpose || !grouped) + ? 
ov::Shape{current_shape[0] * current_shape[1], current_shape[2]} + : ov::Shape{current_shape[0], current_shape[1] * current_shape[2]}; + + return std::make_shared(*constant, new_shape); + }; + + auto convert_u4const_to_u8 = [convert_u4zp_to_u8](std::shared_ptr node) -> std::shared_ptr { + auto constant = std::dynamic_pointer_cast(node); + if (constant->get_element_type() != ov::element::u4 || !convert_u4zp_to_u8) + return std::dynamic_pointer_cast(constant); + return std::make_shared(node, ov::element::u8); + }; + + const ov::Output& fc_input_a = fc->input_value(0); + const auto& scale = reshape_const_to_2d(pattern_map.at(mul_const_m).get_node_shared_ptr()); + std::shared_ptr optional_zero_point = nullptr; + + const bool with_zero_point = + pattern_map.count(sub_no_convert_m) > 0 || pattern_map.count(sub_with_convert_m) > 0; + if (with_zero_point) { + // WA: Convert ZP to u8 for OneDNN case to avoid u4 reorder + optional_zero_point = + convert_u4const_to_u8(reshape_const_to_2d(pattern_map.at(sub_const_m).get_node_shared_ptr())); + } + + std::shared_ptr fc_input_b = reshape_const_to_2d(pattern_map.at(weights_m).get_node_shared_ptr()); + std::shared_ptr fc_input_scale = scale; + std::shared_ptr fc_input_zp = optional_zero_point; + std::shared_ptr fc_input_bias = pattern_map.at(bias_m).get_node_shared_ptr(); + std::vector> result_nodes = {}; + if (has_transpose) { + const auto& transpose = pattern_map.at(transpose_m).get_node_shared_ptr(); + std::shared_ptr transpose_const = pattern_map.at(transpose_const_m).get_node_shared_ptr(); + if (ov::shape_size(transpose_const->get_shape()) != fc_input_b->get_output_partial_shape(0).size()) { + std::vector new_order(fc_input_b->get_output_partial_shape(0).size()); + std::iota(new_order.begin(), new_order.end(), 0); + std::swap(new_order[new_order.size() - 1], new_order[new_order.size() - 2]); + transpose_const = + std::make_shared(ov::element::i32, ov::Shape{new_order.size()}, new_order); + } + + fc_input_b = 
transpose->clone_with_new_inputs({fc_input_b->output(0), transpose_const}); + ov::disable_constant_folding(fc_input_b); + result_nodes.push_back(fc_input_b); + fc_input_scale = transpose->clone_with_new_inputs({scale->output(0), transpose_const}); + ov::disable_constant_folding(fc_input_scale); + result_nodes.push_back(fc_input_scale); + if (with_zero_point && ov::shape_size(optional_zero_point->output(0).get_shape()) > 1) { + fc_input_zp = transpose->clone_with_new_inputs({optional_zero_point->output(0), transpose_const}); + ov::disable_constant_folding(fc_input_zp); + result_nodes.push_back(fc_input_zp); + } + } + + fc_input_zp = + with_zero_point ? fc_input_zp : std::make_shared(element::undefined, Shape{0}); + ov::disable_constant_folding(fc_input_zp); + result_nodes.push_back(fc_input_zp); + + auto new_fc = std::make_shared(fc_input_a, + fc_input_b, + fc_input_bias, + fc_input_scale, + fc_input_zp, + fc->get_output_type()); + + result_nodes.push_back(new_fc); + new_fc->set_friendly_name(fc->get_friendly_name()); + ov::copy_runtime_info(m.get_matched_nodes(), result_nodes); + ov::replace_node(fc, new_fc); + return true; + }; + + auto m = std::make_shared(fully_connected_m, + "ConvertFullyConnectedToFullyConnectedCompressed"); + this->register_matcher(m, callback); +} diff --git a/src/common/transformations/src/transformations/op_conversions/convert_fc_to_quantized_legacy.cpp b/src/common/transformations/src/transformations/op_conversions/convert_fc_to_quantized_legacy.cpp new file mode 100644 index 00000000000000..908e36a51a7eb9 --- /dev/null +++ b/src/common/transformations/src/transformations/op_conversions/convert_fc_to_quantized_legacy.cpp @@ -0,0 +1,77 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "transformations/op_conversions/convert_fc_to_quantized_legacy.hpp" + +#include + +#include "openvino/core/rt_info.hpp" +#include "openvino/core/type/element_type.hpp" +#include "openvino/op/constant.hpp" 
+#include "openvino/op/multiply.hpp" +#include "openvino/pass/pattern/op/label.hpp" +#include "openvino/pass/pattern/op/pattern.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "ov_ops/fully_connected.hpp" +#include "ov_ops/fully_connected_quantized_legacy.hpp" +#include "transformations/utils/utils.hpp" + +ov::pass::ConvertFCToFCQuantizedLegacy::ConvertFCToFCQuantizedLegacy() { + using namespace ov::pass::pattern; + + std::vector activation_types{ov::element::u8, ov::element::i8}; + std::vector weights_types{ov::element::i8}; + + auto activations_m = pattern::any_input(ov::pass::pattern::type_matches_any(activation_types)); + auto weights_m = wrap_type(ov::pass::pattern::type_matches_any(weights_types)); + auto bias_m = pattern::any_input(); + + auto fully_connected_m = wrap_type({activations_m, weights_m, bias_m}); + auto dequantization_scales_m = wrap_type(); + auto multiply_m = wrap_type({fully_connected_m, dequantization_scales_m}); + + ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + + auto fc_output = pattern_map.at(fully_connected_m); + auto activations = pattern_map.at(activations_m); + auto weights = pattern_map.at(weights_m); + auto bias = pattern_map.at(bias_m); + auto multiply = pattern_map.at(multiply_m); + auto dequantization_scales = pattern_map.at(dequantization_scales_m); + const auto& fc_output_shape = fc_output.get_partial_shape(); + const auto& multiply_output_shape = multiply.get_partial_shape(); + + if (*fc_output_shape.rbegin() != *multiply_output_shape.rbegin()) { + return false; + } + + auto fc_node = std::dynamic_pointer_cast( + pattern_map.at(fully_connected_m).get_node_shared_ptr()); + + ov::NodeVector new_ops; + auto zp = std::make_shared(element::undefined, Shape{0}); + new_ops.push_back(zp); + + auto fc_quantized = + std::make_shared(activations, + weights, + bias, + dequantization_scales, + zp, + fc_node->get_output_type()); + 
new_ops.push_back(fc_quantized); + + const auto& multiply_node = multiply.get_node_shared_ptr(); + fc_quantized->set_friendly_name(multiply_node->get_friendly_name()); + + ov::copy_runtime_info({multiply_node, fc_node}, new_ops); + ov::replace_node(multiply_node, fc_quantized); + + return true; + }; + + auto m = std::make_shared(multiply_m, "ConvertFullyConnectedToFullyConnectedQuantized"); + this->register_matcher(m, callback); +} diff --git a/src/common/transformations/tests/common_optimizations/nop_elimination.cpp b/src/common/transformations/tests/common_optimizations/nop_elimination.cpp index 19b5fefd79b9b0..1245bd26e0d3b2 100644 --- a/src/common/transformations/tests/common_optimizations/nop_elimination.cpp +++ b/src/common/transformations/tests/common_optimizations/nop_elimination.cpp @@ -233,6 +233,38 @@ TEST(nop_elimination, squeeze_unsqueeze_elimination_dynamic_without_squeeze_axis EXPECT_NO_THROW(pass_manager.run_passes(f)); } +TEST_F(TransformationTestsF, reshape_reshape_elimination_v1_dynamic) { + { + auto input = make_shared(element::f32, PartialShape({-1, 32, 1, 128})); + + auto top_reshape_const = op::v0::Constant::create(element::i32, Shape{4}, {-1, 32, 1, 128}); + auto top_reshape = std::make_shared(input, top_reshape_const, false); + + auto bottom_reshape_const = op::v0::Constant::create(element::i32, Shape{2}, {-1, 4096}); + auto bottom_reshape = std::make_shared(top_reshape, bottom_reshape_const, false); + + auto add_param = make_shared(element::f32, PartialShape({-1, 4096})); + auto add = std::make_shared(bottom_reshape, add_param); + model = std::make_shared(NodeVector{add}, ParameterVector{input, add_param}); + } + { + auto input = make_shared(element::f32, PartialShape({-1, 32, 1, 128})); + + auto bottom_reshape_const = op::v0::Constant::create(element::i32, Shape{2}, {-1, 4096}); + auto bottom_reshape = std::make_shared(input, bottom_reshape_const, false); + + auto add_param = make_shared(element::f32, PartialShape({-1, 4096})); + auto add 
= std::make_shared(bottom_reshape, add_param); + model_ref = std::make_shared(NodeVector{add}, ParameterVector{input, add_param}); + } + + manager.register_pass(); + manager.run_passes(model); + + auto res = comparator.compare(model, model_ref); + ASSERT_TRUE(res.valid) << res.message; +} + TEST(nop_elimination, reshape_elimination_v1_dynamic_negative) { auto arg = std::make_shared(element::i64, PartialShape::dynamic()); auto pattern = make_shared(element::i64, PartialShape::dynamic(1)); diff --git a/src/common/transformations/tests/control_flow/unroll_if_test.cpp b/src/common/transformations/tests/control_flow/unroll_if_test.cpp index 9c3ac5ea677802..28a2315ca60c0c 100644 --- a/src/common/transformations/tests/control_flow/unroll_if_test.cpp +++ b/src/common/transformations/tests/control_flow/unroll_if_test.cpp @@ -23,9 +23,15 @@ #include "transformations/init_node_info.hpp" #include "transformations/rt_info/fused_names_attribute.hpp" -using namespace ov; using namespace testing; +namespace ov { +namespace test { +using op::v0::Constant; +using op::v0::Parameter; +using op::v0::Result; +using op::v1::Add; + std::shared_ptr get_then_body() { auto Xt = std::make_shared(ov::element::f32, ov::Shape{3}); Xt->set_friendly_name("Xt"); @@ -350,3 +356,60 @@ TEST(TransformationTests, UnrollIfInsideIf) { auto res = compare_functions(f, f_ref); ASSERT_TRUE(res.first) << res.second; } + +TEST(TransformationTests, UnrollIfToParameterResultModel) { + constexpr auto et = element::f32; + std::shared_ptr model, model_ref; + + { + const auto a = std::make_shared(et, PartialShape{5, 7}); + const auto b = std::make_shared(et, PartialShape{1}); + const auto c = std::make_shared(et, PartialShape{5, 7}); + + const auto then_add = std::make_shared(a, b); + auto then_result = std::make_shared(then_add); + auto else_result = std::make_shared(c); + + const auto then_body = std::make_shared(OutputVector{then_result}, ParameterVector{a, b}); + const auto else_body = 
std::make_shared(OutputVector{else_result}, ParameterVector{c}); + + const auto if_input_0 = std::make_shared(et, a->get_output_partial_shape(0)); + const auto if_input_1 = std::make_shared(et, b->get_output_partial_shape(0)); + const auto condition = Constant::create(element::boolean, {1}, {false}); + const auto if_op = std::make_shared(condition); + if_op->set_then_body(then_body); + if_op->set_else_body(else_body); + if_op->set_input(if_input_0, a, c); + if_op->set_input(if_input_1, b, nullptr); + const auto if_result = if_op->set_output(then_result, else_result); + + const auto results = ResultVector{std::make_shared(if_result)}; + model = std::make_shared(results, ParameterVector{if_input_0, if_input_1}, "simple_if"); + model->input(0).set_names({"Input.0"}); + model->input(1).set_names({"Input.1"}); + model->output(0).set_names({"Output"}); + + pass::Manager manager; + manager.register_pass(); + manager.register_pass(); + manager.run_passes(model); + + OV_ASSERT_NO_THROW(check_rt_info(model)); + } + { + const auto p = std::make_shared(et, PartialShape{5, 7}); + const auto r = std::make_shared(p); + model_ref = std::make_shared(ResultVector{r}, ParameterVector{p}, "simple_if"); + model_ref->input(0).set_names({"Input.0"}); + model_ref->output(0).set_names({"Output"}); + } + + const auto cmp_result = compare_functions(model, model_ref); + ASSERT_TRUE(cmp_result.first) << cmp_result.second; + + EXPECT_THAT(model->input(0).get_names(), UnorderedElementsAre("Input.0", "Output")); + EXPECT_THAT(model->output(0).get_names(), UnorderedElementsAre("Output")); +} + +} // namespace test +} // namespace ov diff --git a/src/core/dev_api/openvino/core/descriptor_tensor.hpp b/src/core/dev_api/openvino/core/descriptor_tensor.hpp index 9418183a1189fc..cdd9ba4c2bbab8 100644 --- a/src/core/dev_api/openvino/core/descriptor_tensor.hpp +++ b/src/core/dev_api/openvino/core/descriptor_tensor.hpp @@ -1,12 +1,21 @@ // Copyright (C) 2018-2024 Intel Corporation // 
SPDX-License-Identifier: Apache-2.0 // +#pragma once -#include "openvino/core/descriptor/tensor.hpp" +#include +#include + +#include "openvino/core/partial_shape.hpp" +#include "openvino/core/type/element_type.hpp" namespace ov { namespace descriptor { +class Tensor; +class Input; +class Output; + // To change Tensor element type please change the Parameter type. OPENVINO_API void set_element_type(Tensor& tensor, const element::Type& elemenet_type); @@ -14,5 +23,70 @@ void set_element_type(Tensor& tensor, const element::Type& elemenet_type); // To change Tensor type please change the Parameter type. OPENVINO_API void set_tensor_type(Tensor& tensor, const element::Type& element_type, const PartialShape& pshape); + +/** + * @brief Set destination tensor names as copy of all names from source tensor all tensor names. + * + * @param dst The tensor descriptor to set names. + * @param src The tensor descriptor as from which names will be copied. + */ +OPENVINO_API +void copy_tensor_names(Tensor& dst, const Tensor& src); + +/** @brief Tensor descriptor interface. */ +class OPENVINO_API ITensorDescriptor { +public: + virtual const element::Type& get_element_type() const = 0; + virtual const PartialShape& get_partial_shape() const = 0; + virtual const Shape& get_shape() const = 0; + virtual void set_type_shape(const element::Type& et, const PartialShape& shape) = 0; + + virtual void set_names(const std::unordered_set& names) = 0; + virtual void add_names(const std::unordered_set& names) = 0; + virtual const std::unordered_set& get_names() const = 0; + virtual const std::unordered_set& get_all_names() const = 0; + virtual const std::string& get_any_name() const = 0; + + virtual RTMap& rt_map() = 0; + virtual const RTMap& rt_map() const = 0; + virtual size_t pointer_hash() const noexcept = 0; + +protected: + virtual ~ITensorDescriptor(); +}; + +/** @brief The TensorExtension defines developer API for ov::descriptor::Tensor. 
*/ +struct OPENVINO_API TensorExtension { + /** + * @brief Get the tensor descriptor object + * + * @param tensor Tensor descriptor to access its implementation. + * @return Reference to Tensor description implementation. + */ + static const ITensorDescriptor& get_descriptor(const Tensor& tensor); + static std::shared_ptr& get_descriptor_ptr(Tensor& tensor); + + /** + * @brief The hasher of shared pointer Tensor descriptor. + */ + struct OPENVINO_API Hasher { + size_t operator()(const std::shared_ptr& tensor) const; + }; + + /** + * @brief The comparator of shared pointer Tensor descriptor. + */ + struct OPENVINO_API Equal { + bool operator()(const std::shared_ptr& lhs, const std::shared_ptr& rhs) const; + }; +}; + +/** + * @brief Set input descriptor as shared tensor on output descriptor. + * + * @param output_descriptor Descriptor to set shared tensor. + * @param input_descriptor Input descriptor to set in output as shared tensor. + */ +OPENVINO_API void set_shared_tensor(Output& output_descriptor, const Input& input_descriptor); } // namespace descriptor } // namespace ov diff --git a/src/core/include/openvino/core/any.hpp b/src/core/include/openvino/core/any.hpp index 9badb007d526b9..e002756d361f1f 100644 --- a/src/core/include/openvino/core/any.hpp +++ b/src/core/include/openvino/core/any.hpp @@ -485,6 +485,7 @@ class OPENVINO_API Any { using Ptr = std::shared_ptr; virtual const std::type_info& type_info() const = 0; virtual std::vector base_type_info() const = 0; + bool is_base_type_info(const std::type_info& type_info) const; virtual const void* addressof() const = 0; void* addressof() { return const_cast(const_cast(this)->addressof()); @@ -506,6 +507,9 @@ class OPENVINO_API Any { std::string to_string() const; bool is(const std::type_info& other) const; + bool is_signed_integral() const; + bool is_unsigned_integral() const; + bool is_floating_point() const; template bool is() const { @@ -514,17 +518,24 @@ class OPENVINO_API Any { template T& as() & { - 
type_check(typeid(decay_t)); return *static_cast*>(addressof()); } template const T& as() const& { - type_check(typeid(decay_t)); return *static_cast*>(addressof()); } + template + T convert() const; + protected: + template + [[noreturn]] U convert_impl() const; + + template + U convert_impl() const; + virtual ~Base() = default; }; @@ -685,6 +696,92 @@ class OPENVINO_API Any { T value; }; + // Generic if there is no specialization for T. + template + T& as_impl(...) { + impl_check(); + if (is()) { + return _impl->as(); + } + + OPENVINO_THROW("Bad as from: ", _impl->type_info().name(), " to: ", typeid(T).name()); + } + + template ::value>::type* = nullptr> + T& as_impl(int) { + if (_impl != nullptr) { + if (_impl->is()) { + return _impl->as(); + } else { + _temp = std::make_shared>(); + _impl->read_to(*_temp); + return _temp->as(); + } + } else { + _temp = std::make_shared>(); + return _temp->as(); + } + } + + template < + class T, + typename std::enable_if>::value>::type* = nullptr> + T& as_impl(int) { + if (_impl == nullptr) { + _temp = std::make_shared>>(T{}); + return _temp->as(); + } else { + if (_impl->is()) { + return _impl->as(); + } else { + auto runtime_attribute = _impl->as_runtime_attribute(); + if (runtime_attribute == nullptr) { + OPENVINO_THROW("Any does not contains pointer to runtime_attribute. 
It contains ", + _impl->type_info().name()); + } + auto vptr = std::dynamic_pointer_cast(runtime_attribute); + if (vptr == nullptr && T::element_type::get_type_info_static() != runtime_attribute->get_type_info() && + T::element_type::get_type_info_static() != RuntimeAttribute::get_type_info_static()) { + OPENVINO_THROW("Could not as Any runtime_attribute to ", + typeid(T).name(), + " from ", + _impl->type_info().name(), + "; from ", + static_cast(runtime_attribute->get_type_info()), + " to ", + static_cast(T::element_type::get_type_info_static())); + } + _temp = std::make_shared>>( + std::static_pointer_cast(runtime_attribute)); + return _temp->as(); + } + } + } + + template ::value && + !std::is_same::type, bool>::value>::type* = nullptr> + T& as_impl(int); + + template ::value || util::Readable::value) && !std::is_same::value && + (!std::is_arithmetic::value || std::is_same::type, bool>::value)>::type* = + nullptr> + T& as_impl(int) { + impl_check(); + + if (is()) { + return _impl->as(); + } else if (_impl->is()) { + _temp = std::make_shared>>(); + _impl->read_to(*_temp); + return _temp->as(); + } + + OPENVINO_THROW("Bad as from: ", _impl->type_info().name(), " to: ", typeid(T).name()); + } + friend class ::ov::RuntimeAttribute; friend class ::ov::CompiledModel; friend class ::ov::proxy::CompiledModel; @@ -704,11 +801,11 @@ class OPENVINO_API Any { /// @brief Default constructor Any() = default; - /// @brief Сopy constructor + /// @brief Copy constructor /// @param other other Any object Any(const Any& other); - /// @brief Сopy assignment operator + /// @brief Copy assignment operator /// @param other other Any object /// @return reference to the current object Any& operator=(const Any& other); @@ -756,8 +853,8 @@ class OPENVINO_API Any { * @brief Inplace value construction function * * @tparam T Any type - * @tparam Args pack of paramter types passed to T constructor - * @param args pack of paramters passed to T constructor + * @tparam Args pack of parameter 
types passed to T constructor + * @param args pack of parameters passed to T constructor */ template static Any make(Args&&... args) { @@ -786,130 +883,21 @@ class OPENVINO_API Any { */ template bool is() const { - if (_impl != nullptr) { - if (_impl->is(typeid(decay_t))) { - return true; - } - for (const auto& type_index : _impl->base_type_info()) { - if (util::equal(type_index, typeid(decay_t))) { - return true; - } - } - } - return false; - } - - /** - * Dynamic cast to specified type - * @tparam T type - * @return casted object - */ - template - typename std::enable_if>::value, T>::type& as() { - if (_impl == nullptr) { - _temp = std::make_shared>>(T{}); - return *static_cast*>(_temp->addressof()); - } else { - if (_impl->is(typeid(decay_t))) { - return *static_cast*>(_impl->addressof()); - } else { - auto runtime_attribute = _impl->as_runtime_attribute(); - if (runtime_attribute == nullptr) { - OPENVINO_THROW("Any does not contains pointer to runtime_attribute. It contains ", - _impl->type_info().name()); - } - auto vptr = std::dynamic_pointer_cast(runtime_attribute); - if (vptr == nullptr && T::element_type::get_type_info_static() != runtime_attribute->get_type_info() && - T::element_type::get_type_info_static() != RuntimeAttribute::get_type_info_static()) { - OPENVINO_THROW("Could not cast Any runtime_attribute to ", - typeid(T).name(), - " from ", - _impl->type_info().name(), - "; from ", - static_cast(runtime_attribute->get_type_info()), - " to ", - static_cast(T::element_type::get_type_info_static())); - } - _temp = std::make_shared>>( - std::static_pointer_cast(runtime_attribute)); - return *static_cast*>(_temp->addressof()); - } - } - } - - /** - * Dynamic cast to specified type - * @tparam T type - * @return casted object - */ - template - typename std::enable_if>::value && - !std::is_same::value && std::is_default_constructible::value && - (util::Istreamable::value || util::Readable::value), - T>::type& - as() { - impl_check(); - if 
(_impl->is(typeid(decay_t))) { - return *static_cast*>(_impl->addressof()); - } else if (_impl->is(typeid(std::string))) { - _temp = std::make_shared>>(); - _impl->read_to(*_temp); - return *static_cast*>(_temp->addressof()); - } - for (const auto& type_index : _impl->base_type_info()) { - if (util::equal(type_index, typeid(decay_t))) { - return *static_cast*>(_impl->addressof()); - } - } - OPENVINO_THROW("Bad cast from: ", _impl->type_info().name(), " to: ", typeid(T).name()); - } - - /** - * Dynamic cast to specified type - * @tparam T type - * @return casted object - */ - template - typename std::enable_if< - !std::is_convertible>::value && !std::is_same::value && - (!std::is_default_constructible::value || (!util::Istreamable::value && !util::Readable::value)), - T>::type& - as() { - impl_check(); - if (_impl->is(typeid(decay_t))) { - return *static_cast*>(_impl->addressof()); - } - for (const auto& type_index : _impl->base_type_info()) { - if (util::equal(type_index, typeid(decay_t))) { - return *static_cast*>(_impl->addressof()); - } - } - OPENVINO_THROW("Bad cast from: ", _impl->type_info().name(), " to: ", typeid(T).name()); + return _impl && (_impl->is() || _impl->is_base_type_info(typeid(decay_t))); } /** - * Dynamic cast to specified type + * Dynamic as to specified type * @tparam T type - * @return casted object + * @return reference to caster object */ template - typename std::enable_if::value, T>::type& as() { - if (_impl != nullptr) { - if (_impl->is(typeid(decay_t))) { - return *static_cast*>(_impl->addressof()); - } else { - _temp = std::make_shared>(); - _impl->read_to(*_temp); - return *static_cast(_temp->addressof()); - } - } else { - _temp = std::make_shared>(); - return *static_cast(_temp->addressof()); - } + T& as() { + return as_impl(int{}); } /** - * Dynamic cast to specified type + * Dynamic as to specified type * @tparam T type * @return const reference to caster object */ @@ -983,4 +971,40 @@ inline static void PrintTo(const Any& any, 
std::ostream* os) { } /** @endcond */ +template <> +OPENVINO_API unsigned long long Any::Base::convert() const; + +template <> +OPENVINO_API long long Any::Base::convert() const; + +template <> +OPENVINO_API double Any::Base::convert() const; + +template ::value && + !std::is_same::type, bool>::value>::type*> +T& Any::as_impl(int) { + impl_check(); + if (is()) { + return _impl->as(); + } else if (util::Readable::value && _impl->is()) { + _temp = std::make_shared>>(); + _impl->read_to(*_temp); + return _temp->as(); + } else if (_impl->is_signed_integral()) { + auto value = _impl->convert(); + _temp = std::make_shared>>(static_cast(value)); + return _temp->as(); + } else if (_impl->is_unsigned_integral()) { + auto value = _impl->convert(); + _temp = std::make_shared>>(static_cast(value)); + return _temp->as(); + } else if (_impl->is_floating_point()) { + auto value = _impl->convert(); + _temp = std::make_shared>>(static_cast(value)); + return _temp->as(); + } + + OPENVINO_THROW("Bad as from: ", _impl->type_info().name(), " to: ", typeid(T).name()); +} } // namespace ov diff --git a/src/core/include/openvino/core/descriptor/input.hpp b/src/core/include/openvino/core/descriptor/input.hpp index cbedde6d73a118..2e1335a09ee5c6 100644 --- a/src/core/include/openvino/core/descriptor/input.hpp +++ b/src/core/include/openvino/core/descriptor/input.hpp @@ -13,18 +13,13 @@ namespace ov { class Node; -namespace op { -namespace v0 { -class Result; -} // namespace v0 -} // namespace op + namespace descriptor { class Output; // Describes a tensor that is an input to an op, directly or indirectly via a tuple class OPENVINO_API Input { friend class ov::Node; - friend class ov::op::v0::Result; public: /// \param node The node that owns this input @@ -111,12 +106,6 @@ class OPENVINO_API Input { Input& operator=(const Input&) = default; protected: - /// \return the tensor for the connected output - std::shared_ptr get_tensor_ptr() const; - - /// \return the tensor for the connected 
output - std::shared_ptr get_tensor_ptr(); - // owner of an argument node (in lieu of m_arguments) std::shared_ptr m_src_node; Node* m_node; // The node we are an input for diff --git a/src/core/include/openvino/core/descriptor/tensor.hpp b/src/core/include/openvino/core/descriptor/tensor.hpp index 9624994d8d612e..13a3826c37f291 100644 --- a/src/core/include/openvino/core/descriptor/tensor.hpp +++ b/src/core/include/openvino/core/descriptor/tensor.hpp @@ -22,99 +22,107 @@ namespace ov { class Node; /// \brief Alias for symbol tensor. using TensorSymbol = std::vector>; -/// \brief Alias for vector of symbol tensors. +/// \brief Alias for vector of symbol tensors. using TensorSymbolVector = std::vector; -namespace pass { -class ReverseShapeAndTypeInfer; -} namespace descriptor { - -class Tensor; +class ITensorDescriptor; /// \brief Compile-time descriptor of a first-class value that is a tensor. class OPENVINO_API Tensor { public: + /// \brief Creates Tensor descriptor + /// \param element_type Element type + /// \param pshape Partial shape of tensor + /// \param names Tensor names (optional default empty). Tensor(const element::Type& element_type, const PartialShape& pshape, const std::unordered_set& names = {}); + + OPENVINO_DEPRECATED("This constructor is deprecated. Will be removed in 2026.0") Tensor(const element::Type& element_type, const PartialShape& pshape, Node* node, size_t node_output_number); Tensor(const Tensor&) = delete; Tensor& operator=(const Tensor&) = delete; + /// \brief Gets any tensor name. + /// Throws if tensor has no names. const std::string& get_any_name() const; + + /// \brief Gets tensor names const std::unordered_set& get_names() const; + + /// \brief Set new names. + /// \param names Names to set. void set_names(const std::unordered_set& names); + + /// \brief Adds new names to tensor. + /// \param names new names to be added. 
void add_names(const std::unordered_set& names); /// \brief sets lower bound value description void set_lower_value(const ov::Tensor& value); + /// \brief sets upper bound value description void set_upper_value(const ov::Tensor& value); + /// \brief sets value symbol description void set_value_symbol(const TensorSymbol& value_symbol); + /// \brief unsets bound value descriptions void invalidate_values(); - const element::Type& get_element_type() const { - return m_element_type; - } + /// \brief Gets element type. + const element::Type& get_element_type() const; + + /// \brief Gets shape. + /// Throw if Tensor's shape is not static. const Shape& get_shape() const; - const PartialShape& get_partial_shape() const { - return m_partial_shape; - } + + /// \brief Gets partial shape. + const PartialShape& get_partial_shape() const; + /// \brief gets lower bound value description - const ov::Tensor& get_lower_value() const { - return m_lower_value; - } + const ov::Tensor& get_lower_value() const; + /// \brief gets upper bound value description - const ov::Tensor& get_upper_value() const { - return m_upper_value; - } + const ov::Tensor& get_upper_value() const; + /// \brief gets symbol value description - TensorSymbol get_value_symbol() const { - return m_value_symbol; - } + TensorSymbol get_value_symbol() const; + /// \brief checks if lower and upper bound are set and point to the same Tensor - bool has_and_set_bound() const { - return m_upper_value && m_lower_value && m_upper_value.data() == m_lower_value.data(); - } + bool has_and_set_bound() const; + + /// \brief Get Tensor size in bytes. + /// \return Size in bytes. size_t size() const; - RTMap& get_rt_info() { - return m_rt_info; - } - const RTMap& get_rt_info() const { - return m_rt_info; - } + /// \brief Gets runtime informations. + /// \return Runtime information map which can be modified. + RTMap& get_rt_info(); - void clone_from(const Tensor& old); + /// \brief Gets runtime informations. 
+ /// \return Read only runtime information map. + const RTMap& get_rt_info() const; -protected: - element::Type m_element_type; + /// \brief Clones Tensor from the other. + /// \param other Tensor used to clone its properties. + void clone_from(const Tensor& other); - PartialShape m_partial_shape; +protected: ov::Tensor m_lower_value, m_upper_value; TensorSymbol m_value_symbol; - - std::unordered_set m_names; - std::unordered_set::const_iterator m_name_it; - RTMap m_rt_info; - - friend OPENVINO_API void set_element_type(Tensor& tensor, const element::Type& elemenet_type); - friend OPENVINO_API void set_tensor_type(Tensor& tensor, - const element::Type& element_type, - const PartialShape& pshape); + std::shared_ptr m_impl; private: - mutable std::atomic m_shape_changing{false}; - mutable bool m_shape_changed{true}; - mutable Shape m_shape; + // hidden extension API for Tensor descriptor + friend struct TensorExtension; }; OPENVINO_API std::ostream& operator<<(std::ostream&, const ov::descriptor::Tensor&); } // namespace descriptor + } // namespace ov diff --git a/src/core/include/openvino/core/except.hpp b/src/core/include/openvino/core/except.hpp index fdb3746d323350..a923cd98c7e576 100644 --- a/src/core/include/openvino/core/except.hpp +++ b/src/core/include/openvino/core/except.hpp @@ -62,6 +62,7 @@ class OPENVINO_API AssertFailure : public Exception { const char* check_string, const std::string& context_info, const std::string& explanation); + virtual ~AssertFailure(); protected: explicit AssertFailure(const std::string& what_arg) : ov::Exception(what_arg) {} @@ -71,6 +72,7 @@ class OPENVINO_API AssertFailure : public Exception { class OPENVINO_API NotImplemented : public AssertFailure { public: [[noreturn]] static void create(const char* file, int line, const std::string& explanation); + virtual ~NotImplemented(); static const std::string default_msg; diff --git a/src/core/include/openvino/op/result.hpp b/src/core/include/openvino/op/result.hpp index 
00e805d1f2aeb5..9cad2d9444a267 100644 --- a/src/core/include/openvino/op/result.hpp +++ b/src/core/include/openvino/op/result.hpp @@ -13,6 +13,51 @@ namespace v0 { /// \brief Result operation. /// /// \ingroup ov_ops_cpp_api +/// +/// The Result output tensor is special, it shares tensor with Result's input but requires to have dedicated properties +/// like: +/// - tensor names. +/// +/// Setting/adding Result's output names modify this specific tensor names. +/// Result's specific tensor names are added to input descriptor and transferred to new descriptor if Result's input +/// has been replaced. +/// +/// Examples 1: No specific names on Result's output +/// +/// set output names: +/// [N1] +/// ↓ +/// |----------------| [names: N1] |-----------------| +/// | Node |--------------------------->| Result | -> Model output names: N1 +/// |----------------| |-----------------| +/// +/// +/// Examples 2: Result's has got specific names +/// +/// set output names: set output names: +/// [N1] [R1, R2] +/// ↓ ↓ +/// |----------------| [names: N1, R1, R2] |-----------------| +/// | Node |--------------------------->| Result | -> Model output names: R1, R2 +/// |----------------| |-----------------| +/// +/// +/// Examples 3: Result from example 2 connected to new node +/// +/// set output names: set output names: +/// [N2] [R1, R2] +/// ↓ ↓ +/// |----------------| [names: N2, R1, R2] |-----------------| +/// | Node |--------------------------->| Result | -> Model output names: R1, R2 +/// |----------------| |-----------------| +/// +/// set output names: +/// [N1] +/// ↓ +/// |----------------| [names: N1] +/// | Node |-----------------> +/// |----------------| +/// class OPENVINO_API Result : public Op { public: OPENVINO_OP("Result", "opset1"); diff --git a/src/core/include/openvino/pass/pattern/matcher.hpp b/src/core/include/openvino/pass/pattern/matcher.hpp index bbd7e32b0a1802..7112ac9ff85e64 100644 --- a/src/core/include/openvino/pass/pattern/matcher.hpp +++ 
b/src/core/include/openvino/pass/pattern/matcher.hpp @@ -62,10 +62,31 @@ class OPENVINO_API Matcher { // Avoid implicit string construction from nullptr. Matcher(const std::shared_ptr pattern_node, std::nullptr_t name) = delete; - Matcher() = default; - Matcher(Output& pattern_node) : m_pattern_node{pattern_node} {} - - Matcher(Output& pattern_node, const std::string& name) : m_pattern_node(pattern_node), m_name{name} {} + Matcher() + : m_match_root{}, + m_pattern_node{}, + m_pattern_map{}, + m_pattern_value_maps{}, + m_matched_list{}, + m_name{""}, + m_strict_mode{false} {} + Matcher(Output& pattern_node) + : m_match_root{}, + m_pattern_node{pattern_node}, + m_pattern_map{}, + m_pattern_value_maps{}, + m_matched_list{}, + m_name{""}, + m_strict_mode{false} {} + + Matcher(Output& pattern_node, const std::string& name) + : m_match_root{}, + m_pattern_node{pattern_node}, + m_pattern_map{}, + m_pattern_value_maps{}, + m_matched_list{}, + m_name{name}, + m_strict_mode{false} {} /// \brief Constructs a Matcher object /// @@ -73,9 +94,13 @@ class OPENVINO_API Matcher { /// \param name is a string which is used for logging and disabling a matcher /// \param strict_mode forces a matcher to consider shapes and ET of nodes Matcher(const Output& pattern_node, const std::string& name, bool strict_mode) - : m_pattern_node(pattern_node), - m_name(name), - m_strict_mode(strict_mode) {} + : m_match_root{}, + m_pattern_node{pattern_node}, + m_pattern_map{}, + m_pattern_value_maps{}, + m_matched_list{}, + m_name{name}, + m_strict_mode{strict_mode} {} // Some matches should start on a node rather than an output. These three constructors // are transition until we work out the right way to do that. 
diff --git a/src/core/reference/include/openvino/reference/utils/registers_pool.hpp b/src/core/reference/include/openvino/reference/utils/registers_pool.hpp index 62dfe01ec4ef1d..4861ef4f7d999d 100644 --- a/src/core/reference/include/openvino/reference/utils/registers_pool.hpp +++ b/src/core/reference/include/openvino/reference/utils/registers_pool.hpp @@ -64,7 +64,13 @@ class RegistersPool { } void release() { if (auto pool = regPool.lock()) { - pool->return_to_pool(reg); + try { + pool->return_to_pool(reg); + } catch (...) { + // This function is called by destructor and should not throw. Well formed Reg object won't cause + // any exception throw from return_to_pool, while on badly formed object the destructor is most + // likely called during exception stack unwind. + } regPool.reset(); } } @@ -90,8 +96,10 @@ class RegistersPool { RegistersPool::WeakPtr regPool; }; + static thread_local bool is_created; + virtual ~RegistersPool() { - check_unique_and_update(false); + is_created = false; } template @@ -178,7 +186,7 @@ class RegistersPool { } } - void check_unique_and_update(bool isCtor = true); + void check_unique_and_update(); PhysicalSet m_general_set; PhysicalSet m_simd_set; diff --git a/src/core/reference/src/utils/registers_pool.cpp b/src/core/reference/src/utils/registers_pool.cpp index 413fdcc3ed83cf..a1e6462aa51a36 100644 --- a/src/core/reference/src/utils/registers_pool.cpp +++ b/src/core/reference/src/utils/registers_pool.cpp @@ -34,16 +34,12 @@ RegistersPool::RegistersPool(std::initializer_list regsToExclude, in m_general_set.exclude(Xbyak::Reg64(Xbyak::Operand::RSP)); } -void RegistersPool::check_unique_and_update(bool is_ctor) { - static thread_local bool is_created = false; - if (is_ctor) { - if (is_created) { - OPENVINO_THROW("There should be only one instance of RegistersPool per thread"); - } - is_created = true; - } else { - is_created = false; - } +thread_local bool RegistersPool::is_created = false; + +void 
RegistersPool::check_unique_and_update() { + OPENVINO_ASSERT(!is_created, "There should be only one instance of RegistersPool per thread"); + + is_created = true; } void RegistersPool::PhysicalSet::set_as_used(size_t reg_idx) { diff --git a/src/core/shape_inference/include/glu_shape_inference.hpp b/src/core/shape_inference/include/glu_shape_inference.hpp new file mode 100644 index 00000000000000..365b57244036a2 --- /dev/null +++ b/src/core/shape_inference/include/glu_shape_inference.hpp @@ -0,0 +1,34 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ov_ops/glu.hpp" +#include "utils.hpp" +#include "variadic_split_shape_inference.hpp" + +namespace ov { +namespace op { +namespace internal { +template > +std::vector shape_infer(const GLU* op, const std::vector& input_shapes) { + const auto inputs_count = input_shapes.size(); + NODE_SHAPE_INFER_CHECK(op, input_shapes, inputs_count == 1); + + int64_t axis = op->get_axis(); + std::vector split_lengths = {op->get_split_lengths(), -1}; + std::unordered_map const_data; + const_data.emplace(1, ov::Tensor(ov::element::i64, ov::Shape{}, &axis)); + const_data.emplace(2, ov::Tensor(ov::element::i64, ov::Shape{split_lengths.size()}, split_lengths.data())); + + const ov::Shape split_len_size{split_lengths.size()}; + const ov::Shape scalar{}; + std::vector variadic_split_input_shapes{input_shapes[0], scalar, split_len_size}; + + return {std::move( + ov::op::variadic_split::shape_infer(op, variadic_split_input_shapes, ov::make_tensor_accessor(const_data))[0])}; +} +} // namespace internal +} // namespace op +} // namespace ov diff --git a/src/core/shape_inference/include/ov_optional.hpp b/src/core/shape_inference/include/ov_optional.hpp index f7f8b474f9a5a6..15973ae0c8a5f8 100644 --- a/src/core/shape_inference/include/ov_optional.hpp +++ b/src/core/shape_inference/include/ov_optional.hpp @@ -7,6 +7,9 @@ #include namespace ov { +#ifdef OPENVINO_CPP_17_VER +using 
optional = std::optional; +#else /** * @brief Store optional object of type T (basic version of std::optional). @@ -132,4 +135,5 @@ class optional { bool m_has_value = false; Storage m_opt{}; }; +#endif } // namespace ov diff --git a/src/core/shape_inference/include/variadic_split_shape_inference.hpp b/src/core/shape_inference/include/variadic_split_shape_inference.hpp index a0eff51f238e61..e0cd837003a331 100644 --- a/src/core/shape_inference/include/variadic_split_shape_inference.hpp +++ b/src/core/shape_inference/include/variadic_split_shape_inference.hpp @@ -10,10 +10,9 @@ namespace ov { namespace op { -namespace v1 { - +namespace variadic_split { template > -std::vector shape_infer(const VariadicSplit* op, +std::vector shape_infer(const Node* op, const std::vector& input_shapes, const ITensorAccessor& ta = make_tensor_accessor()) { constexpr bool is_dynamic_shape = std::is_base_of::value; @@ -120,6 +119,15 @@ std::vector shape_infer(const VariadicSplit* op, } return output_shapes; } +} // namespace variadic_split + +namespace v1 { +template > +std::vector shape_infer(const VariadicSplit* op, + const std::vector& input_shapes, + const ITensorAccessor& ta = make_tensor_accessor()) { + return op::variadic_split::shape_infer(op, input_shapes, ta); +} } // namespace v1 } // namespace op diff --git a/src/core/src/any.cpp b/src/core/src/any.cpp index 82dc01c99377fd..346819eced93e5 100644 --- a/src/core/src/any.cpp +++ b/src/core/src/any.cpp @@ -6,6 +6,17 @@ #include #include +namespace { +template +bool contains_type_index(Container&& types, const std::type_info& user_type) { + for (auto&& type : types) { + if (ov::util::equal(type, user_type)) { + return true; + } + } + return false; +} +} // namespace namespace ov { @@ -68,6 +79,48 @@ void Any::Base::read_to(Base& other) const { } } +bool Any::Base::is_base_type_info(const std::type_info& user_type) const { + return contains_type_index(base_type_info(), user_type); +} + +bool Any::Base::is_signed_integral() const { 
+ return std::is_signed::value ? contains_type_index(std::initializer_list{typeid(char), + typeid(signed char), + typeid(short), + typeid(int), + typeid(long), + typeid(long long)}, + type_info()) + : contains_type_index(std::initializer_list{typeid(signed char), + typeid(short), + typeid(int), + typeid(long), + typeid(long long)}, + type_info()); +} + +bool Any::Base::is_unsigned_integral() const { + return std::is_signed::value + ? contains_type_index(std::initializer_list{typeid(unsigned char), + typeid(unsigned short), + typeid(unsigned int), + typeid(unsigned long), + typeid(unsigned long long)}, + type_info()) + : contains_type_index(std::initializer_list{typeid(char), + typeid(unsigned char), + typeid(unsigned short), + typeid(unsigned int), + typeid(unsigned long), + typeid(unsigned long long)}, + type_info()); +} +bool Any::Base::is_floating_point() const { + return contains_type_index( + std::initializer_list{typeid(float), typeid(double), typeid(long double)}, + type_info()); +} + Any::~Any() { _temp = {}; _impl = {}; @@ -293,4 +346,42 @@ void Write::operator()(std::ostream& os, const Any& any) const { } } // namespace util + +template +[[noreturn]] U Any::Base::convert_impl() const { + OPENVINO_THROW("Bad cast from: ", type_info().name(), " to: ", typeid(U).name()); +} + +template +U Any::Base::convert_impl() const { + return is() ? static_cast(as()) : convert_impl(); +} + +template <> +long long Any::Base::convert() const { + return std::is_signed::value ? convert_impl() + : convert_impl(); +} + +template <> +unsigned long long Any::Base::convert() const { + return std::is_signed::value ? 
convert_impl() + : convert_impl(); +} + +template <> +double Any::Base::convert() const { + return convert_impl(); +} } // namespace ov diff --git a/src/core/src/descriptor/input.cpp b/src/core/src/descriptor/input.cpp index 544abd10945806..28288002780739 100644 --- a/src/core/src/descriptor/input.cpp +++ b/src/core/src/descriptor/input.cpp @@ -71,14 +71,6 @@ ov::descriptor::Tensor& ov::descriptor::Input::get_tensor() { return m_output->get_tensor(); } -std::shared_ptr ov::descriptor::Input::get_tensor_ptr() const { - return m_output->get_tensor_ptr(); -} - -std::shared_ptr ov::descriptor::Input::get_tensor_ptr() { - return m_output->get_tensor_ptr(); -} - const ov::Shape& ov::descriptor::Input::get_shape() const { return m_output->get_shape(); } diff --git a/src/core/src/descriptor/shared_tensor.cpp b/src/core/src/descriptor/shared_tensor.cpp new file mode 100644 index 00000000000000..314aa524bcacec --- /dev/null +++ b/src/core/src/descriptor/shared_tensor.cpp @@ -0,0 +1,125 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/core/descriptor/output.hpp" +#include "openvino/core/descriptor_tensor.hpp" +#include "openvino/util/common_util.hpp" + +namespace ov { +namespace descriptor { +/** + * @brief Dedicated tensor descriptor implementation to share input descriptor. 
+ * + * Shared tensor share input tensor but have specific properties: + * - tensor names - if set these are used as descriptor names and appended to input tensor because is same tensor + */ +class SharedTensor : public ITensorDescriptor { +public: + SharedTensor(std::shared_ptr tensor) + : m_shared_tensor{std::move(tensor)}, + m_output_names{}, + m_name_it{} { + OPENVINO_ASSERT(m_shared_tensor, "Cannot set NULL tensor descriptor"); + } + + // --- ITensorDescriptor API + virtual const element::Type& get_element_type() const override { + return m_shared_tensor->get_element_type(); + } + + virtual const PartialShape& get_partial_shape() const override { + return m_shared_tensor->get_partial_shape(); + } + + virtual const Shape& get_shape() const override { + return m_shared_tensor->get_shape(); + } + + virtual void set_type_shape(const element::Type& et, const PartialShape& shape) override { + m_shared_tensor->set_type_shape(et, shape); + } + + void set_names(const std::unordered_set& names) override { + rm_tensor_output_names(); + m_output_names = names; + m_name_it = std::min_element(m_output_names.begin(), m_output_names.end()); + m_shared_tensor->add_names(m_output_names); + } + + void add_names(const std::unordered_set& names) override { + m_output_names.insert(names.begin(), names.end()); + m_name_it = std::min_element(m_output_names.begin(), m_output_names.end()); + m_shared_tensor->add_names(names); + } + + const std::unordered_set& get_names() const override { + return m_output_names.empty() ? m_shared_tensor->get_names() : m_output_names; + } + + const std::unordered_set& get_all_names() const override { + return m_shared_tensor->get_names(); + } + + const std::string& get_any_name() const override { + return m_output_names.empty() ? 
m_shared_tensor->get_any_name() : *m_name_it; + } + + const RTMap& rt_map() const override { + return m_shared_tensor->rt_map(); + } + + RTMap& rt_map() override { + return m_shared_tensor->rt_map(); + } + + size_t pointer_hash() const noexcept override { + return m_shared_tensor->pointer_hash(); + } + + // --- SharedTensor specific interface + void set_tensor(std::shared_ptr tensor) { + if (tensor != m_shared_tensor) { + OPENVINO_ASSERT(tensor, "Cannot set NULL tensor descriptor"); + rm_tensor_output_names(); + auto prev_rt_map = rt_map(); + + m_shared_tensor = std::move(tensor); + m_shared_tensor->add_names(m_output_names); + rt_map().insert(std::make_move_iterator(prev_rt_map.begin()), std::make_move_iterator(prev_rt_map.end())); + } + } + +private: + void rm_tensor_output_names() { + auto names = m_shared_tensor->get_names(); + for (const auto& output_name : m_output_names) { + names.erase(output_name); + } + + m_shared_tensor->set_names(names); + } + + std::shared_ptr m_shared_tensor; + std::unordered_set m_output_names; + std::unordered_set::const_iterator m_name_it; +}; + +/** + * @brief Set output tensor descriptor with shared tensor from new input. + * + * @param output Output descriptor to be updated. + * @param input Input descriptor to set as shared tensor. 
+ */ +void set_shared_tensor(Output& output, const Input& input) { + auto& output_descriptor = TensorExtension::get_descriptor_ptr(output.get_tensor()); + const auto& input_descriptor = TensorExtension::get_descriptor_ptr(input.get_output().get_tensor()); + if (auto* result_ptr = dynamic_cast(output_descriptor.get())) { + result_ptr->set_tensor(input_descriptor); + } else { + output_descriptor = std::make_shared(input_descriptor); + } +} + +} // namespace descriptor +} // namespace ov diff --git a/src/core/src/descriptor/tensor.cpp b/src/core/src/descriptor/tensor.cpp index ae3f7c6e77cd4f..6e85b25b2b9f8a 100644 --- a/src/core/src/descriptor/tensor.cpp +++ b/src/core/src/descriptor/tensor.cpp @@ -8,26 +8,153 @@ #include "openvino/core/descriptor_tensor.hpp" #include "openvino/core/except.hpp" #include "openvino/core/node.hpp" +#include "openvino/core/type/element_iterator.hpp" #include "openvino/op/util/symbolic_info.hpp" +#include "openvino/util/common_util.hpp" -ov::descriptor::Tensor::Tensor(const element::Type& element_type, - const PartialShape& pshape, - const std::unordered_set& names) - : m_element_type(element_type), - m_partial_shape(pshape) { - set_names(names); +namespace ov { +namespace descriptor { + +/** @brief Helper class to store Tensor shape information.*/ +class ShapeInfo { +public: + ShapeInfo() = default; + ShapeInfo(const PartialShape& shape) : m_partial_shape{shape} {} + + void set_partial_shape(PartialShape shape) { + AtomicGuard lock(m_shape_changing); + m_partial_shape = std::move(shape); + m_shape_changed = true; + } + + const PartialShape& get_partial_shape() const { + return m_partial_shape; + } + + const Shape& get_shape() const { + AtomicGuard lock(m_shape_changing); + if (m_shape_changed) { + m_shape = m_partial_shape.to_shape(); + m_shape_changed = false; + } + return m_shape; + } + +private: + PartialShape m_partial_shape{}; + mutable Shape m_shape{}; + mutable std::atomic m_shape_changing{false}; + mutable bool 
m_shape_changed{true}; +}; + +// --- Tensor descriptor interface +ITensorDescriptor::~ITensorDescriptor() = default; + +/** @brief Basic tensor descriptor. */ +class BasicTensor : public ITensorDescriptor { +public: + BasicTensor() = default; + + BasicTensor(const element::Type& et, const PartialShape& shape, const std::unordered_set& names) + : m_element_type{et}, + m_shape_info{shape}, + m_names{names}, + m_name_it{find_new_any_name(m_names)}, + m_rt_map{}, + m_legacy_name{} {} + + virtual const element::Type& get_element_type() const override { + return m_element_type; + } + + virtual const PartialShape& get_partial_shape() const override { + return m_shape_info.get_partial_shape(); + } + + virtual const Shape& get_shape() const override { + return m_shape_info.get_shape(); + } + + virtual void set_type_shape(const element::Type& et, const PartialShape& shape) override { + m_element_type = et; + m_shape_info.set_partial_shape(shape); + } + + void set_names(const std::unordered_set& names) override { + m_names = names; + m_name_it = find_new_any_name(m_names); + }; + + void add_names(const std::unordered_set& names) override { + m_names.insert(names.begin(), names.end()); + m_name_it = find_new_any_name(m_names); + } + + const std::unordered_set& get_names() const override { + return m_names; + } + + const std::unordered_set& get_all_names() const override { + return get_names(); + } + + const std::string& get_any_name() const override { + OPENVINO_ASSERT(!get_names().empty(), "Attempt to get a name for a Tensor without names"); + return *m_name_it; + } + + const RTMap& rt_map() const override { + return m_rt_map; + } + + RTMap& rt_map() override { + return m_rt_map; + }; + + size_t pointer_hash() const noexcept override { + return std::hash()(this); + } + +private: + element::Type m_element_type; + ShapeInfo m_shape_info; + std::unordered_set m_names; + std::unordered_set::const_iterator m_name_it; + RTMap m_rt_map; + std::string m_legacy_name; + + static 
decltype(m_name_it) find_new_any_name(const decltype(m_names)& names) { + return std::min_element(names.begin(), names.end()); + } +}; + +// --- TensorExtension +const ITensorDescriptor& TensorExtension::get_descriptor(const Tensor& tensor) { + return *tensor.m_impl; } -ov::descriptor::Tensor::Tensor(const element::Type& element_type, - const PartialShape& pshape, - ov::Node* node, - size_t node_output_number) - : m_element_type(element_type), - m_partial_shape(pshape) { - m_name_it = m_names.cend(); +std::shared_ptr& TensorExtension::get_descriptor_ptr(Tensor& tensor) { + return tensor.m_impl; } -void ov::descriptor::Tensor::invalidate_values() { +bool TensorExtension::Equal::operator()(const std::shared_ptr& lhs, const std::shared_ptr& rhs) const { + return TensorExtension::get_descriptor(*lhs).pointer_hash() == TensorExtension::get_descriptor(*rhs).pointer_hash(); +} + +size_t TensorExtension::Hasher::operator()(const std::shared_ptr& tensor) const { + return get_descriptor(*tensor).pointer_hash(); +} + +// --- Tensor +Tensor::Tensor(const element::Type& element_type, + const PartialShape& pshape, + const std::unordered_set& names) + : m_impl(std::make_shared(element_type, pshape, names)) {} + +Tensor::Tensor(const element::Type& element_type, const PartialShape& pshape, ov::Node* node, size_t) + : m_impl(std::make_shared(element_type, pshape, std::unordered_set{})) {} + +void Tensor::invalidate_values() { if (ov::skip_invalidation(*this)) return; m_upper_value = {}; @@ -35,110 +162,110 @@ void ov::descriptor::Tensor::invalidate_values() { m_value_symbol.clear(); } -void ov::descriptor::Tensor::set_lower_value(const ov::Tensor& value) { +void Tensor::set_lower_value(const ov::Tensor& value) { OPENVINO_ASSERT(static_cast(value)); - OPENVINO_ASSERT(m_partial_shape.same_scheme(value.get_shape())); - OPENVINO_ASSERT(m_element_type == value.get_element_type()); + OPENVINO_ASSERT(get_partial_shape().same_scheme(value.get_shape())); + OPENVINO_ASSERT(get_element_type() 
== value.get_element_type()); m_lower_value = value; } -void ov::descriptor::Tensor::set_upper_value(const ov::Tensor& value) { +void Tensor::set_upper_value(const ov::Tensor& value) { OPENVINO_ASSERT(static_cast(value)); - OPENVINO_ASSERT(m_partial_shape.same_scheme(value.get_shape())); - OPENVINO_ASSERT(m_element_type == value.get_element_type()); + OPENVINO_ASSERT(get_partial_shape().same_scheme(value.get_shape())); + OPENVINO_ASSERT(get_element_type() == value.get_element_type()); m_upper_value = value; } -void ov::descriptor::Tensor::set_value_symbol(const TensorSymbol& value_symbol) { +void Tensor::set_value_symbol(const TensorSymbol& value_symbol) { const auto& symbols_size = value_symbol.size(); if (symbols_size == 0) { m_value_symbol.clear(); } else { - OPENVINO_ASSERT(m_partial_shape.is_static()); - OPENVINO_ASSERT(shape_size(m_partial_shape.to_shape()) == symbols_size); + OPENVINO_ASSERT(get_partial_shape().is_static()); + OPENVINO_ASSERT(shape_size(get_partial_shape().to_shape()) == symbols_size); m_value_symbol = value_symbol; } } -const ov::Shape& ov::descriptor::Tensor::get_shape() const { - AtomicGuard lock(m_shape_changing); - if (m_shape_changed) { - m_shape = m_partial_shape.to_shape(); - m_shape_changed = false; - } - return m_shape; +const ov::Tensor& Tensor::get_lower_value() const { + return m_lower_value; } -size_t ov::descriptor::Tensor::size() const { - const bool bitwidth_less_than_byte = m_element_type.bitwidth() < 8; - return bitwidth_less_than_byte ? 
(shape_size(get_shape()) * m_element_type.bitwidth() + 7) >> 3 - : (shape_size(get_shape()) * m_element_type.size()); +const ov::Tensor& Tensor::get_upper_value() const { + return m_upper_value; } -const std::unordered_set& ov::descriptor::Tensor::get_names() const { - return m_names; +TensorSymbol Tensor::get_value_symbol() const { + return m_value_symbol; } -const std::string& ov::descriptor::Tensor::get_any_name() const { - if (m_name_it == m_names.cend()) { - OPENVINO_THROW("Attempt to get a name for a Tensor without names"); - } - return *m_name_it; +bool Tensor::has_and_set_bound() const { + return m_upper_value && m_lower_value && m_upper_value.data() == m_lower_value.data(); } -void ov::descriptor::Tensor::set_names(const std::unordered_set& names) { - m_names = names; - m_name_it = m_names.cbegin(); - for (auto it = m_names.cbegin(); it != m_names.cend(); it++) { - if (*it < *m_name_it) - // Update any name - m_name_it = it; - } +const element::Type& Tensor::get_element_type() const { + return m_impl->get_element_type(); } -void ov::descriptor::Tensor::add_names(const std::unordered_set& names) { - for (const auto& name : names) { - auto res = m_names.insert(name); - if (m_name_it == m_names.end() || *res.first < *m_name_it) - // Update any name - m_name_it = res.first; - } +const PartialShape& Tensor::get_partial_shape() const { + return m_impl->get_partial_shape(); +} +const Shape& Tensor::get_shape() const { + return m_impl->get_shape(); } -void ov::descriptor::Tensor::clone_from(const ov::descriptor::Tensor& old) { - { - AtomicGuard lock(m_shape_changing); - m_partial_shape = old.get_partial_shape(); - m_shape_changed = true; - } - set_names(old.get_names()); - m_element_type = old.get_element_type(); - m_lower_value = old.get_lower_value(); - m_upper_value = old.get_upper_value(); - m_value_symbol = old.get_value_symbol(); - m_rt_info = old.get_rt_info(); +size_t Tensor::size() const { + return element::get_memory_size(get_element_type(), 
shape_size(get_shape())); } -void ov::descriptor::set_tensor_type(ov::descriptor::Tensor& tensor, - const element::Type& element_type, - const PartialShape& pshape) { - tensor.m_element_type = element_type; - AtomicGuard lock(tensor.m_shape_changing); - tensor.m_partial_shape = pshape; - tensor.m_shape_changed = true; +const std::unordered_set& Tensor::get_names() const { + return m_impl->get_names(); } -void ov::descriptor::set_element_type(ov::descriptor::Tensor& tensor, const element::Type& element_type) { - tensor.m_element_type = element_type; +const RTMap& Tensor::get_rt_info() const { + return m_impl->rt_map(); } -std::ostream& ov::descriptor::operator<<(std::ostream& out, const ov::descriptor::Tensor& tensor) { - std::string names; - for (const auto& name : tensor.get_names()) { - if (!names.empty()) - names += ", "; - names += name; - } - out << "Tensor(" << names << ")"; +RTMap& Tensor::get_rt_info() { + return m_impl->rt_map(); +} + +const std::string& Tensor::get_any_name() const { + return m_impl->get_any_name(); +} + +void Tensor::set_names(const std::unordered_set& names) { + m_impl->set_names(names); +} + +void Tensor::add_names(const std::unordered_set& names) { + m_impl->add_names(names); +} + +void Tensor::clone_from(const Tensor& other) { + m_impl->set_type_shape(other.get_element_type(), other.get_partial_shape()); + set_names(other.get_names()); + m_lower_value = other.get_lower_value(); + m_upper_value = other.get_upper_value(); + m_value_symbol = other.get_value_symbol(); + get_rt_info() = other.get_rt_info(); +} + +void set_tensor_type(Tensor& tensor, const element::Type& element_type, const PartialShape& pshape) { + TensorExtension::get_descriptor_ptr(tensor)->set_type_shape(element_type, pshape); +} + +void set_element_type(Tensor& tensor, const element::Type& element_type) { + TensorExtension::get_descriptor_ptr(tensor)->set_type_shape(element_type, tensor.get_partial_shape()); +} + +void copy_tensor_names(Tensor& dst, const Tensor& src) 
{ + dst.set_names(TensorExtension::get_descriptor(src).get_all_names()); +} + +std::ostream& operator<<(std::ostream& out, const Tensor& tensor) { + out << "Tensor(" << util::join(tensor.get_names()) << ")"; return out; } +} // namespace descriptor +} // namespace ov diff --git a/src/core/src/except.cpp b/src/core/src/except.cpp index 6ce0568e04e387..7cddc5b3ec4a52 100644 --- a/src/core/src/except.cpp +++ b/src/core/src/except.cpp @@ -45,8 +45,12 @@ void ov::AssertFailure::create(const char* file, throw ov::AssertFailure(make_what(file, line, check_string, context_info, explanation)); } +ov::AssertFailure::~AssertFailure() = default; + void ov::NotImplemented::create(const char* file, int line, const std::string& explanation) { throw ov::NotImplemented(make_what(file, line, nullptr, default_msg, explanation)); } +ov::NotImplemented::~NotImplemented() = default; + const std::string ov::NotImplemented::default_msg{"Not Implemented"}; diff --git a/src/core/src/node.cpp b/src/core/src/node.cpp index ec9197a5a337cb..689e1c80af12a0 100644 --- a/src/core/src/node.cpp +++ b/src/core/src/node.cpp @@ -155,8 +155,8 @@ std::shared_ptr ov::Node::copy_with_new_inputs( for (auto& cdep : control_dependencies) { clone->add_control_dependency(cdep); } - for (size_t i = 0; i < get_output_size(); i++) { - clone->get_output_tensor(i).set_names(get_output_tensor(i).get_names()); + for (size_t i = 0; i < get_output_size(); ++i) { + descriptor::copy_tensor_names(clone->get_output_tensor(i), get_output_tensor(i)); } return clone; } @@ -218,9 +218,8 @@ ov::descriptor::Input& ov::Node::get_input_descriptor(size_t position) { ov::descriptor::Output& ov::Node::get_output_descriptor(size_t position) { while (m_outputs.size() <= position) { - size_t i = m_outputs.size(); - auto tensor_descriptor = make_shared(element::dynamic, PartialShape::dynamic(), this, i); - m_outputs.emplace_back(this, i, tensor_descriptor); + const auto i = m_outputs.size(); + m_outputs.emplace_back(this, i, 
make_shared(element::dynamic, PartialShape::dynamic())); } return m_outputs[position]; } @@ -468,8 +467,8 @@ ov::descriptor::Tensor& ov::Node::get_output_tensor(size_t i) const { ov::descriptor::Tensor& ov::Node::get_input_tensor(size_t i) const { OPENVINO_ASSERT(i < m_inputs.size(), idx_txt, i, out_of_range_txt); - descriptor::Input input = m_inputs[i]; - return input.get_tensor(); + auto& input = m_inputs[i]; + return input.get_output().get_tensor(); } size_t ov::Node::get_input_size() const { diff --git a/src/core/src/op/result.cpp b/src/core/src/op/result.cpp index 237d6bd7a2084a..97dc95a0e53f17 100644 --- a/src/core/src/op/result.cpp +++ b/src/core/src/op/result.cpp @@ -9,6 +9,7 @@ #include #include "itt.hpp" +#include "openvino/core/descriptor_tensor.hpp" namespace ov { namespace op { @@ -22,10 +23,8 @@ void Result::validate_and_infer_types() { OV_OP_SCOPE(v0_Result_validate_and_infer_types); NODE_VALIDATION_CHECK(this, get_input_size() == 1, "Argument has ", get_input_size(), " outputs (1 expected)."); - // Result doesn't change change in/out tensors - auto& output = get_output_descriptor(0); - auto& input = get_input_descriptor(0); - output.set_tensor_ptr(input.get_tensor_ptr()); + // Result shares input tensor but can have specific properties which are added/removed to input. 
+ descriptor::set_shared_tensor(get_output_descriptor(0), get_input_descriptor(0)); } std::shared_ptr Result::clone_with_new_inputs(const OutputVector& new_args) const { diff --git a/src/core/src/pass/manager.cpp b/src/core/src/pass/manager.cpp index 9168292f5284c0..a6f1fc287e221c 100644 --- a/src/core/src/pass/manager.cpp +++ b/src/core/src/pass/manager.cpp @@ -104,8 +104,8 @@ class stopwatch { void stop() { if (m_active) { - auto end_time = m_clock.now(); - m_last_time = end_time - m_start_time; + m_end_time = m_clock.now(); + m_last_time = m_end_time - m_start_time; m_active = false; } } @@ -122,9 +122,17 @@ class stopwatch { return std::chrono::duration_cast(get_timer_value()).count(); } + std::chrono::nanoseconds get_start_time() const { + return std::chrono::duration_cast(m_start_time.time_since_epoch()); + } + + std::chrono::nanoseconds get_end_time() const { + return std::chrono::duration_cast(m_end_time.time_since_epoch()); + } + private: std::chrono::high_resolution_clock m_clock; - std::chrono::time_point m_start_time; + std::chrono::time_point m_start_time, m_end_time; bool m_active = false; std::chrono::nanoseconds m_last_time = std::chrono::high_resolution_clock::duration::zero(); }; @@ -221,6 +229,8 @@ class Profiler { if (is_pass_manager) { m_file << "m;" << name << ";" << stopwatch.get_timer_value().count() << ";" << (applied ? "1" : "0") << std::endl; + m_file << "m_start;" << name << ";" << stopwatch.get_start_time().count() << std::endl; + m_file << "m_end;" << name << ";" << stopwatch.get_end_time().count() << std::endl; } else { m_file << "t;" << name << ";" << m_manager_name << ";" << stopwatch.get_timer_value().count() << ";" << (applied ? 
"1" : "0") << std::endl; diff --git a/src/core/src/preprocess/preprocess_impls.cpp b/src/core/src/preprocess/preprocess_impls.cpp index cbe18a78beb575..c2523beed66620 100644 --- a/src/core/src/preprocess/preprocess_impls.cpp +++ b/src/core/src/preprocess/preprocess_impls.cpp @@ -6,6 +6,7 @@ #include "layout_utils.hpp" #include "openvino/core/descriptor_tensor.hpp" +#include "openvino/util/common_util.hpp" namespace ov { namespace preprocess { @@ -325,11 +326,9 @@ void InputInfo::InputInfoImpl::dump(std::ostream& str, //----------- OutputInfoImpl ---------- void OutputInfo::OutputInfoImpl::build(ov::ResultVector& results) { - std::shared_ptr result; auto node = m_output_node; - const auto start_out_node_names = node.get_tensor().get_names(); - node.get_tensor().set_names({}); - result = std::dynamic_pointer_cast(node.get_node_shared_ptr()); + const auto result = ov::as_type_ptr(node.get_node_shared_ptr()); + // Set result layout from 'model' information if (get_model_data()->is_layout_set()) { // Overwrite existing model's layout here (fix 74065) @@ -369,49 +368,36 @@ void OutputInfo::OutputInfoImpl::build(ov::ResultVector& results) { node = std::get<0>(action_result); post_processing_applied = true; } - // Restore tensor names - node.get_tensor().set_names(start_out_node_names); + auto orig_parent = result->get_input_source_output(0).get_node_shared_ptr(); - bool reset_orig_friendly_name = false; + // Move result tensor names from previous input to new + const auto result_input_names = result->get_input_tensor(0).get_names(); + result->get_input_tensor(0).set_names({}); + node.get_tensor().set_names(result_input_names); + if (!post_processing_applied) { return; } + if (orig_parent->get_output_size() == 1) { node.get_node_shared_ptr()->set_friendly_name(orig_parent->get_friendly_name()); - reset_orig_friendly_name = true; + + // Reset friendly name of input node to avoid names collision + // when there is at a new node inserted by post-processing steps + // If no 
new nodes are inserted by post-processing, then we need to preserve friendly name of input + // as it's required for old API correct work + result->get_input_source_output(0).get_node_shared_ptr()->set_friendly_name(""); } else if (node.get_node_shared_ptr() != orig_parent) { // Result node is changed - add "." suffix node.get_node_shared_ptr()->set_friendly_name(orig_parent->get_friendly_name() + "." + std::to_string(result->get_input_source_output(0).get_index())); } - - // Reset friendly name of input node to avoid names collision - // when there is at a new node inserted by post-processing steps - // If no new nodes are inserted by post-processing, then we need to preserve friendly name of input - // as it's required for old API correct work - if (reset_orig_friendly_name) { - result->get_input_source_output(0).get_node_shared_ptr()->set_friendly_name(""); - } - - // Create result - auto new_result = std::make_shared(node); - new_result->set_friendly_name(result->get_friendly_name()); - - // Preserve runtime info of original result - new_result->get_rt_info() = result->get_rt_info(); - new_result->input(0).get_rt_info() = result->input(0).get_rt_info(); - new_result->output(0).get_rt_info() = result->output(0).get_rt_info(); + result->input(0).replace_source_output(node); + result->revalidate_and_infer_types(); // Update layout if (!context.layout().empty()) { - new_result->set_layout(context.layout()); - } - - for (auto& old_result : results) { - if (result == old_result) { - old_result = new_result; - break; - } + result->set_layout(context.layout()); } } @@ -439,7 +425,7 @@ void OutputInfo::OutputInfoImpl::dump(std::ostream& str) const { str << "Output "; if (!start_out_node_names.empty()) { - str << "\"" << *start_out_node_names.begin() << "\""; + str << "\"" << util::join(start_out_node_names) << "\""; } str << ":" << std::endl; str << " Model's data tensor: "; diff --git a/src/core/tests/any.cpp b/src/core/tests/any.cpp index 
3914a617ff2982..33e928d60b872d 100644 --- a/src/core/tests/any.cpp +++ b/src/core/tests/any.cpp @@ -11,7 +11,8 @@ #include "common_test_utils/test_assertions.hpp" #include "openvino/core/runtime_attribute.hpp" -using namespace ov; +namespace ov { +namespace test { class DestructorTest { public: @@ -735,3 +736,70 @@ TEST_F(AnyTests, EmptyStringAsAny) { ASSERT_EQ(p.as>(), ref_f); ASSERT_EQ(p.as>(), ref_i); } + +template +class AnyConversionTest : public AnyTests {}; + +TYPED_TEST_SUITE_P(AnyConversionTest); + +using AnyArithmeticTypes = ::testing::Types; + +TYPED_TEST_P(AnyConversionTest, AnyToOtherValue) { + const TypeParam test_value{static_cast(23.15f)}; + const auto a = Any{test_value}; + + EXPECT_EQ(a.as(), static_cast(test_value)); + EXPECT_EQ(a.as(), static_cast(test_value)); + EXPECT_EQ(a.as(), static_cast(test_value)); + EXPECT_EQ(a.as(), static_cast(test_value)); + + EXPECT_EQ(a.as(), static_cast(test_value)); + EXPECT_EQ(a.as(), static_cast(test_value)); + EXPECT_EQ(a.as(), static_cast(test_value)); + EXPECT_EQ(a.as(), static_cast(test_value)); + EXPECT_EQ(a.as(), static_cast(test_value)); + + EXPECT_EQ(a.as(), static_cast(test_value)); + EXPECT_EQ(a.as(), static_cast(test_value)); +} + +REGISTER_TYPED_TEST_SUITE_P(AnyConversionTest, AnyToOtherValue); +INSTANTIATE_TYPED_TEST_SUITE_P(InstantiationName, AnyConversionTest, AnyArithmeticTypes); + +TEST_F(AnyTests, AnyAsOtherTypeIsIncosisoinet) { + // To show member `as` current behaviour. + // Maybe there should be two members `as` which return value + // and `cast` returns reference if casted type is same as Any underlying type + auto a = Any{10}; + + auto& a_int = a.as(); + auto& a_str = a.as(); + + EXPECT_EQ(a_int, 10); + EXPECT_EQ(a_str, "10"); + + a_int = 15; + EXPECT_EQ(a_int, 15); + // as string ref still has old value + EXPECT_EQ(a_str, "10"); + + a_str = "30"; + EXPECT_EQ(a_int, 15); + // as string ref has new value but is not in sync what any contains. 
+ EXPECT_EQ(a_str, "30"); +} + +} // namespace test +} // namespace ov diff --git a/src/core/tests/pattern.cpp b/src/core/tests/pattern.cpp index 050c36b65baad1..982e59b55f0f97 100644 --- a/src/core/tests/pattern.cpp +++ b/src/core/tests/pattern.cpp @@ -558,8 +558,8 @@ TEST(pattern, multiple_optionals_in_row) { // Pattern: auto in = wrap_type(); - auto pattern_convert = optional(in); - auto pattern_relu = optional(pattern_convert); + auto pattern_convert = pattern::optional(in); + auto pattern_relu = pattern::optional(pattern_convert); auto pattern_sigmoid = wrap_type({pattern_relu}); // Test: @@ -1255,4 +1255,4 @@ TEST(pattern, pattern_optional_root) { // Should perfectly match ASSERT_TRUE(tm.match(pattern_relu, model_relu)); -} \ No newline at end of file +} diff --git a/src/core/tests/type_prop/result.cpp b/src/core/tests/type_prop/result.cpp index f0c0eecc285004..9776768df052a0 100644 --- a/src/core/tests/type_prop/result.cpp +++ b/src/core/tests/type_prop/result.cpp @@ -7,8 +7,12 @@ #include "common_test_utils/type_prop.hpp" #include "openvino/op/constant.hpp" -using namespace std; -using namespace ov; +namespace ov { +namespace test { + +using ov::op::v0::Parameter; +using std::make_shared; +using testing::UnorderedElementsAre; TEST(type_prop, result) { const auto arg_shape = Shape{1, 2, 3, 4, 5}; @@ -51,3 +55,101 @@ TEST(type_prop, result_layout_invalid) { result->output(0).get_rt_info()[ov::LayoutAttribute::get_type_info_static()] = "NCHW"; // incorrect way ASSERT_THROW(result->get_layout(), ov::Exception); } + +using TypePropResultV0Test = TypePropOpTest; + +TEST_F(TypePropResultV0Test, set_specific_output_name_by_output) { + auto a = std::make_shared(element::f32, PartialShape::dynamic()); + a->get_output_tensor(0).set_names({"input"}); + + auto result = make_op(a); + + EXPECT_THAT(result->output(0).get_names(), UnorderedElementsAre("input")); + EXPECT_THAT(result->get_output_tensor(0).get_names(), UnorderedElementsAre("input")); + + 
result->output(0).set_names({"out"}); + EXPECT_THAT(result->output(0).get_names(), UnorderedElementsAre("out")); + EXPECT_THAT(result->get_output_tensor(0).get_names(), UnorderedElementsAre("out")); + EXPECT_THAT(a->output(0).get_names(), UnorderedElementsAre("input", "out")); + EXPECT_THAT(a->get_output_tensor(0).get_names(), UnorderedElementsAre("input", "out")); +} + +TEST_F(TypePropResultV0Test, set_specific_output_name_by_tensor_desc) { + auto a = std::make_shared(element::f32, PartialShape::dynamic()); + a->get_output_tensor(0).set_names({"input"}); + + auto result = make_op(a); + + EXPECT_THAT(result->get_output_tensor(0).get_names(), UnorderedElementsAre("input")); + + result->get_output_tensor(0).set_names({"out"}); + EXPECT_THAT(result->output(0).get_names(), UnorderedElementsAre("out")); + EXPECT_THAT(result->get_output_tensor(0).get_names(), UnorderedElementsAre("out")); + EXPECT_THAT(a->output(0).get_names(), UnorderedElementsAre("input", "out")); + EXPECT_THAT(a->get_output_tensor(0).get_names(), UnorderedElementsAre("input", "out")); +} + +TEST_F(TypePropResultV0Test, change_specific_output_name) { + auto a = std::make_shared(element::f32, PartialShape::dynamic()); + a->get_output_tensor(0).set_names({"input"}); + + auto result = make_op(a); + + EXPECT_THAT(result->output(0).get_names(), UnorderedElementsAre("input")); + + result->get_output_tensor(0).set_names({"out"}); + + EXPECT_THAT(result->output(0).get_names(), UnorderedElementsAre("out")); + EXPECT_THAT(result->get_output_tensor(0).get_names(), UnorderedElementsAre("out")); + EXPECT_THAT(a->output(0).get_names(), UnorderedElementsAre("input", "out")); + EXPECT_THAT(a->get_output_tensor(0).get_names(), UnorderedElementsAre("input", "out")); + + result->output(0).set_names({"new output"}); + + EXPECT_THAT(result->output(0).get_names(), UnorderedElementsAre("new output")); + EXPECT_THAT(result->get_output_tensor(0).get_names(), UnorderedElementsAre("new output")); + 
EXPECT_THAT(a->output(0).get_names(), UnorderedElementsAre("input", "new output")); + EXPECT_THAT(a->get_output_tensor(0).get_names(), UnorderedElementsAre("input", "new output")); +} + +TEST_F(TypePropResultV0Test, add_specific_output_name) { + auto a = std::make_shared(element::f32, PartialShape::dynamic()); + a->get_output_tensor(0).set_names({"input"}); + + auto result = make_op(a); + + EXPECT_THAT(result->output(0).get_names(), UnorderedElementsAre("input")); + + result->output(0).set_names({"out"}); + result->get_output_tensor(0).add_names({"extra output name", "o1"}); + result->output(0).add_names({"extra output name", "o2"}); + + EXPECT_THAT(result->output(0).get_names(), UnorderedElementsAre("out", "extra output name", "o1", "o2")); + EXPECT_THAT(result->get_output_tensor(0).get_names(), UnorderedElementsAre("out", "extra output name", "o1", "o2")); + EXPECT_THAT(a->output(0).get_names(), UnorderedElementsAre("input", "out", "extra output name", "o1", "o2")); + EXPECT_THAT(a->get_output_tensor(0).get_names(), + UnorderedElementsAre("input", "out", "extra output name", "o1", "o2")); +} + +TEST_F(TypePropResultV0Test, preserve_specific_name_on_input_replace) { + const auto a = std::make_shared(element::f32, PartialShape::dynamic()); + a->get_output_tensor(0).set_names({"input a"}); + + const auto result = make_op(a); + result->output(0).set_names({"out"}); + + EXPECT_THAT(result->input(0).get_tensor().get_names(), UnorderedElementsAre("out", "input a")); + EXPECT_THAT(result->output(0).get_names(), UnorderedElementsAre("out")); + + const auto b = std::make_shared(element::f32, PartialShape::dynamic()); + b->get_output_tensor(0).set_names({"input b"}); + + result->input(0).replace_source_output(b); + result->validate_and_infer_types(); + + EXPECT_THAT(result->input(0).get_tensor().get_names(), UnorderedElementsAre("input b", "out")); + EXPECT_THAT(result->output(0).get_names(), UnorderedElementsAre("out")); + EXPECT_THAT(a->output(0).get_names(), 
UnorderedElementsAre("input a")); +} +} // namespace test +} // namespace ov diff --git a/src/frontends/ir/src/ir_deserializer.cpp b/src/frontends/ir/src/ir_deserializer.cpp index 12cd171db03e01..981cdf283d63cd 100644 --- a/src/frontends/ir/src/ir_deserializer.cpp +++ b/src/frontends/ir/src/ir_deserializer.cpp @@ -10,6 +10,7 @@ #include "openvino/core/except.hpp" #include "openvino/core/meta_data.hpp" #include "openvino/core/rt_info/weightless_caching_attributes.hpp" +#include "openvino/core/type.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/op/constant.hpp" #include "openvino/op/loop.hpp" @@ -850,7 +851,9 @@ std::shared_ptr ov::XmlDeserializer::create_node(const std::vector(inputs[i].get_node_shared_ptr()) && + ov::element::Type_t::undefined == inputs[i].get_element_type()) OPENVINO_THROW(params.type, " layer ", params.name, diff --git a/src/frontends/jax/src/op/square.cpp b/src/frontends/jax/src/op/square.cpp new file mode 100644 index 00000000000000..268debb7992ba8 --- /dev/null +++ b/src/frontends/jax/src/op/square.cpp @@ -0,0 +1,28 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/frontend/jax/node_context.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/power.hpp" +#include "openvino/op/squeeze.hpp" +#include "utils.hpp" + +namespace ov { +namespace frontend { +namespace jax { +namespace op { + +using namespace ov::op; + +OutputVector translate_square(const NodeContext& context) { + num_inputs_check(context, 1, 1); + auto x = context.get_input(0); + auto const_two = create_same_type_const_scalar(x, 2); + return {std::make_shared(x, const_two)}; +}; + +} // namespace op +} // namespace jax +} // namespace frontend +} // namespace ov diff --git a/src/frontends/jax/src/op_table.cpp b/src/frontends/jax/src/op_table.cpp index 98f22452c5afab..3ca58745bc1909 100644 --- a/src/frontends/jax/src/op_table.cpp +++ b/src/frontends/jax/src/op_table.cpp @@ -53,6 +53,7 
@@ OP_CONVERTER(translate_reduce_window_sum); OP_CONVERTER(translate_reshape); OP_CONVERTER(translate_rsqrt); OP_CONVERTER(translate_slice); +OP_CONVERTER(translate_square); OP_CONVERTER(translate_squeeze); OP_CONVERTER(translate_transpose); @@ -92,6 +93,7 @@ const std::map get_supported_ops_jaxpr() { {"rsqrt", op::translate_rsqrt}, {"reshape", op::translate_reshape}, {"slice", op::translate_slice}, + {"square", op::translate_square}, {"sqrt", op::translate_1to1_match_1_input}, {"squeeze", op::translate_squeeze}, {"stop_gradient", op::skip_node}, diff --git a/src/frontends/pytorch/src/op/linear.cpp b/src/frontends/pytorch/src/op/linear.cpp index 2d01dee84c151b..4a5ad4a6b0e73b 100644 --- a/src/frontends/pytorch/src/op/linear.cpp +++ b/src/frontends/pytorch/src/op/linear.cpp @@ -5,6 +5,10 @@ #include "openvino/frontend/pytorch/node_context.hpp" #include "openvino/op/add.hpp" #include "openvino/op/matmul.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/op/shape_of.hpp" +#include "openvino/op/subtract.hpp" #include "utils.hpp" namespace ov { @@ -12,6 +16,8 @@ namespace frontend { namespace pytorch { namespace op { +using namespace ov::op; + OutputVector translate_linear(const NodeContext& context) { // schema: aten::linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor num_inputs_check(context, 2, 3); @@ -20,17 +26,91 @@ OutputVector translate_linear(const NodeContext& context) { if (weight.get_element_type() == element::f16 || weight.get_element_type() == element::bf16) { // In case of patched linear it can have mixed fp16/bf16 and fp32 input type. // In other cases these conversion is not required. 
- weight = context.mark_node(std::make_shared(weight, x)); + weight = context.mark_node(std::make_shared(weight, x)); } - auto matmul = context.mark_node(std::make_shared(x, weight, false, true)); + auto matmul = context.mark_node(std::make_shared(x, weight, false, true)); if (!context.input_is_none(2)) { auto bias = context.get_input(2); if (bias.get_element_type() == element::f16 || bias.get_element_type() == element::bf16) { // Same reason as for weight. - bias = context.mark_node(std::make_shared(bias, x)); + bias = context.mark_node(std::make_shared(bias, x)); + } + matmul = context.mark_node(std::make_shared(matmul, bias)); + } + return {matmul}; +}; + +namespace { +uint32_t rearrange_awq_bits(uint32_t num) { + uint32_t result = 0; + uint32_t mask = 0xF; + + // Rearrange each 4-bit part in accordance with the AWQ i32->u4 unpacking schema + result |= (num & (mask << 0)) << 0; + result |= (num & (mask << 16)) >> 12; + result |= (num & (mask << 4)) << 4; + result |= (num & (mask << 20)) >> 8; + result |= (num & (mask << 8)) << 8; + result |= (num & (mask << 24)) >> 4; + result |= (num & (mask << 12)) << 12; + result |= (num & (mask << 28)) >> 0; + + return result; +} + +Output rearrange_constant(const Output& c, uint32_t groups) { + auto constant = std::dynamic_pointer_cast(c.get_node_shared_ptr()); + FRONT_END_OP_CONVERSION_CHECK(constant, "weight must be Constant."); + auto src = constant->get_data_ptr(); + auto initial_shape = constant->get_shape(); + FRONT_END_OP_CONVERSION_CHECK(initial_shape.size() == 2, "Only 2D constants are supported."); + auto new_shape = Shape{initial_shape[0] / groups, groups, initial_shape[1] * 8}; + auto new_qweight = std::make_shared(element::u4, new_shape); + auto dst = const_cast(reinterpret_cast(new_qweight->get_data_ptr())); + for (size_t i = 0; i < shape_size(constant->get_shape()); i++) { + dst[i] = rearrange_awq_bits(src[i]); + } + return new_qweight; +} +} // namespace + +OutputVector translate_linear_awq(const 
NodeContext& context) { + num_inputs_check(context, 4, 7); + auto x = context.get_input(0); + auto qweight = context.get_input(1); + auto qzeros = context.get_input(2); + auto scales = context.get_input(3); + auto groups = context.const_input(4); + auto bits = context.const_input(5); + + FRONT_END_OP_CONVERSION_CHECK(bits == 4, "Only 4 bit AWQ is supported."); + + auto new_qweight = rearrange_constant(qweight, static_cast(groups)); + auto new_qzeros = rearrange_constant(qzeros, 1); + new_qweight = context.mark_node(std::make_shared(new_qweight, scales.get_element_type())); + new_qzeros = context.mark_node(std::make_shared(new_qzeros, scales.get_element_type())); + + auto w_s = context.mark_node(std::make_shared(new_qweight, new_qzeros)); + FRONT_END_OP_CONVERSION_CHECK(scales.get_partial_shape().is_static(), "Scales must be constant."); + auto scales_shape = scales.get_shape(); + auto new_scales_shape = + v0::Constant::create(element::i32, {3}, std::vector{scales_shape[0], 1, scales_shape[1]}); + scales = context.mark_node(std::make_shared(scales, new_scales_shape, false)); + auto weight = context.mark_node(std::make_shared(w_s, scales)); + auto out_shape = + v0::Constant::create(element::i32, {2}, std::vector{static_cast(qweight.get_shape()[0]), -1}); + weight = context.mark_node(std::make_shared(weight, out_shape, false)); + weight = context.mark_node(std::make_shared(weight, x)); + + auto matmul = context.mark_node(std::make_shared(x, weight, false, false)); + if (!context.input_is_none(6)) { + auto bias = context.get_input(6); + + if (bias.get_element_type() == element::f16 || bias.get_element_type() == element::bf16) { + bias = context.mark_node(std::make_shared(bias, x)); } - matmul = context.mark_node(std::make_shared(matmul, bias)); + matmul = context.mark_node(std::make_shared(matmul, bias)); } return {matmul}; }; diff --git a/src/frontends/pytorch/src/op/stft.cpp b/src/frontends/pytorch/src/op/stft.cpp index b7e4858c2f8fcc..d1fe4f9f15828b 100644 --- 
a/src/frontends/pytorch/src/op/stft.cpp +++ b/src/frontends/pytorch/src/op/stft.cpp @@ -10,6 +10,7 @@ #include "openvino/op/convert_like.hpp" #include "openvino/op/divide.hpp" #include "openvino/op/shape_of.hpp" +#include "openvino/op/sqrt.hpp" #include "openvino/op/unsqueeze.hpp" #include "utils.hpp" @@ -66,8 +67,6 @@ OutputVector translate_stft(const NodeContext& context) { if (!context.input_is_none(5)) { normalized = context.const_input(5); } - PYTORCH_OP_CONVERSION_CHECK(!normalized, - "aten::stft conversion is currently supported with normalized=False only."); bool onesided = true; if (!context.input_is_none(6)) { @@ -85,7 +84,15 @@ OutputVector translate_stft(const NodeContext& context) { // Perform STFT constexpr bool transpose_frames = true; auto stft = context.mark_node(std::make_shared(input, window, n_fft, hop_length, transpose_frames)); - return {stft}; + + if (normalized) { + const auto nfft_convert = context.mark_node(std::make_shared(n_fft, stft)); + const auto divisor = context.mark_node(std::make_shared(nfft_convert)); + const auto norm_stft = context.mark_node(std::make_shared(stft, divisor)); + return {norm_stft}; + } else { + return {stft}; + } }; } // namespace op } // namespace pytorch diff --git a/src/frontends/pytorch/src/op_table.cpp b/src/frontends/pytorch/src/op_table.cpp index 7307833430411f..a73c13814d7663 100644 --- a/src/frontends/pytorch/src/op_table.cpp +++ b/src/frontends/pytorch/src/op_table.cpp @@ -61,7 +61,6 @@ OP_CONVERTER(translate_clamp); OP_CONVERTER(translate_col2im); OP_CONVERTER(translate_constant); OP_CONVERTER(translate_conv_transposend); -OP_CONVERTER(translate_conv1d_ext); OP_CONVERTER(translate_convnd); OP_CONVERTER(translate_convolution); OP_CONVERTER(translate_convolution_mode); @@ -77,7 +76,6 @@ OP_CONVERTER(translate_dot); OP_CONVERTER(translate_elu); OP_CONVERTER(translate_embedding); OP_CONVERTER(translate_embedding_bag); -OP_CONVERTER(translate_embedding_ext); OP_CONVERTER(translate_empty); 
OP_CONVERTER(translate_empty_like); OP_CONVERTER(translate_erf); @@ -325,6 +323,10 @@ OP_CONVERTER(translate_unbind_int_fx); OP_CONVERTER(translate_unique2); OP_CONVERTER(translate_zeros_fx); OP_CONVERTER(translate_zeros_like_fx); +// Extensions +OP_CONVERTER(translate_conv1d_ext); +OP_CONVERTER(translate_embedding_ext); +OP_CONVERTER(translate_linear_awq); } // namespace op @@ -699,6 +701,7 @@ const std::unordered_map get_supported_ops_ts() { {"aten::zero", op::translate_zeros_like}, {"aten::zeros", op::translate_zeros}, {"aten::zeros_like", op::translate_zeros_like}, + {"ov_ext::awq_gemm", op::translate_linear_awq}, {"ov_ext::embedding", op::translate_embedding_ext}, {"ov_ext::conv1d", op::translate_conv1d_ext}, {"ov_ext::linear", op::translate_linear}, @@ -864,7 +867,6 @@ const std::unordered_map get_supported_ops_fx() { {"aten.hardtanh.default", op::translate_hardtanh}, {"aten.hardtanh_.default", op::inplace_op}, {"aten.index.Tensor", op::translate_index_fx}, - {"aten._unsafe_index.Tensor", op::translate_index_fx}, {"aten.index_select.default", op::translate_index_select}, {"aten.isfinite.default", op::translate_1to1_match_1_inputs}, {"aten.isinf.default", op::translate_1to1_match_1_inputs}, diff --git a/src/frontends/pytorch/src/utils.cpp b/src/frontends/pytorch/src/utils.cpp index 752b9accb71d01..5cc7ec21f30911 100644 --- a/src/frontends/pytorch/src/utils.cpp +++ b/src/frontends/pytorch/src/utils.cpp @@ -42,7 +42,11 @@ using namespace ov::op; void num_inputs_check(const NodeContext& context, size_t min_inputs, size_t max_inputs) { auto num_inputs = context.get_input_size(); - FRONT_END_OP_CONVERSION_CHECK(num_inputs >= min_inputs, "Got less inputs than expected"); + FRONT_END_OP_CONVERSION_CHECK(num_inputs >= min_inputs, + "Got less inputs ", + num_inputs, + " than expected ", + min_inputs); for (auto i = max_inputs; i < num_inputs; i++) { FRONT_END_OP_CONVERSION_CHECK(context.input_is_none(i), "Got more inputs than expected."); } diff --git 
a/src/frontends/tensorflow/src/checkpoint_v1_reader.cpp b/src/frontends/tensorflow/src/checkpoint_v1_reader.cpp index d506759fd33716..dab130cb381731 100644 --- a/src/frontends/tensorflow/src/checkpoint_v1_reader.cpp +++ b/src/frontends/tensorflow/src/checkpoint_v1_reader.cpp @@ -254,7 +254,7 @@ void CheckpointV1Reader::read_variable(const std::string& variable_name, ov::Any // This is only present at the first item of each checkpoint file and serves // as a table of contents, listing all the tensor slices saved in this file. - ::tensorflow::SavedTensorSlices sts; + ::tensorflow::SavedTensorSlices sts{}; FRONT_END_GENERAL_CHECK(sts.ParseFromArray(raw_data.data(), static_cast(raw_data.size())), "[TensorFlow Frontend] incorrect input checkpoint file or internal error: cannot parse " "SavedTensorSlices entry"); diff --git a/src/frontends/tensorflow/src/op/var_handle.cpp b/src/frontends/tensorflow/src/op/var_handle.cpp index f0077ae206bf6d..53fdf21d6086bf 100644 --- a/src/frontends/tensorflow/src/op/var_handle.cpp +++ b/src/frontends/tensorflow/src/op/var_handle.cpp @@ -98,7 +98,7 @@ OutputVector translate_varhandle_op(const NodeContext& node) { TENSORFLOW_OP_VALIDATION(node, result, "[TensorFlow Frontend] Internal error: Cannot find requested variable."); - ::tensorflow::BundleEntryProto entry; + ::tensorflow::BundleEntryProto entry{}; TENSORFLOW_OP_VALIDATION(node, entry.ParseFromArray(entry_data, static_cast(entry_size)), "[TensorFlow Frontend] Internal error: Cannot get read bundle entry."); diff --git a/src/frontends/tensorflow/src/op/xla_conv_v2.cpp b/src/frontends/tensorflow/src/op/xla_conv_v2.cpp index a01780d58cfeae..795f4deb3d93ef 100644 --- a/src/frontends/tensorflow/src/op/xla_conv_v2.cpp +++ b/src/frontends/tensorflow/src/op/xla_conv_v2.cpp @@ -111,7 +111,7 @@ OutputVector translate_xla_conv_v2_op(const NodeContext& node) { is_all_one, "[TensorFlow Frontend] internal error: convolutional kernel with holes is not supported"); - ConvolutionDimensionNumbers 
dimension_numbers; + ConvolutionDimensionNumbers dimension_numbers{}; TENSORFLOW_OP_VALIDATION( node, dimension_numbers.ParseFromArray(dimension_numbers_message.data(), diff --git a/src/frontends/tensorflow/src/variables_index.cpp b/src/frontends/tensorflow/src/variables_index.cpp index 778f8b2f94bb7c..f4e330518e20c2 100644 --- a/src/frontends/tensorflow/src/variables_index.cpp +++ b/src/frontends/tensorflow/src/variables_index.cpp @@ -128,7 +128,7 @@ void VariablesIndex::read_bundle_header() { auto item = m_variables_index.find(""); FRONT_END_GENERAL_CHECK(item != m_variables_index.end(), "Bundle Header isn't found in index"); - ::tensorflow::BundleHeaderProto bundleHeader; + ::tensorflow::BundleHeaderProto bundleHeader{}; FRONT_END_GENERAL_CHECK(bundleHeader.ParseFromArray(item->second.data(), static_cast(item->second.size())), "Bundle Header: Cannot parse Bundle Header"); FRONT_END_GENERAL_CHECK(bundleHeader.version().producer() == 1, "Bundle Header: Unsupported producer version"); @@ -147,7 +147,7 @@ void VariablesIndex::read_checkpointable_object_graph() { return; } - ::tensorflow::BundleEntryProto entry; + ::tensorflow::BundleEntryProto entry{}; FRONT_END_GENERAL_CHECK(entry.ParseFromArray(item->second.data(), static_cast(item->second.size())), "CMO: Cannot parse Bundle Entry"); diff --git a/src/inference/dev_api/openvino/runtime/icompiled_model.hpp b/src/inference/dev_api/openvino/runtime/icompiled_model.hpp index 01f7b556da909f..3a3d5d9910305f 100644 --- a/src/inference/dev_api/openvino/runtime/icompiled_model.hpp +++ b/src/inference/dev_api/openvino/runtime/icompiled_model.hpp @@ -136,11 +136,11 @@ class OPENVINO_RUNTIME_API ICompiledModel : public std::enable_shared_from_this< /** * @brief Release intermediate memory - * + * */ virtual void release_memory(); - virtual ~ICompiledModel() = default; + virtual ~ICompiledModel(); private: std::shared_ptr m_plugin; diff --git a/src/inference/dev_api/openvino/runtime/internal_properties.hpp 
b/src/inference/dev_api/openvino/runtime/internal_properties.hpp index 60d6b66cfda897..bec304104581ac 100644 --- a/src/inference/dev_api/openvino/runtime/internal_properties.hpp +++ b/src/inference/dev_api/openvino/runtime/internal_properties.hpp @@ -9,6 +9,7 @@ #pragma once +#include "openvino/runtime/aligned_buffer.hpp" #include "openvino/runtime/properties.hpp" #include "openvino/runtime/threading/istreams_executor.hpp" @@ -36,6 +37,12 @@ static constexpr Property, PropertyMutability::RO> cac */ static constexpr Property caching_with_mmap{"CACHING_WITH_MMAP"}; +/** + * @brief Property to get a ov::AlignedBuffer with cached model + * @ingroup ov_dev_api_plugin_api + */ +static constexpr Property, PropertyMutability::RW> cached_model_buffer{"CACHED_MODEL_BUFFER"}; + /** * @brief Allow to create exclusive_async_requests with one executor * @ingroup ov_dev_api_plugin_api diff --git a/src/inference/dev_api/openvino/runtime/isync_infer_request.hpp b/src/inference/dev_api/openvino/runtime/isync_infer_request.hpp index bbe81ee1c9602d..b80bf32958e4ac 100644 --- a/src/inference/dev_api/openvino/runtime/isync_infer_request.hpp +++ b/src/inference/dev_api/openvino/runtime/isync_infer_request.hpp @@ -15,6 +15,7 @@ #include #include "openvino/core/descriptor/tensor.hpp" +#include "openvino/core/descriptor_tensor.hpp" #include "openvino/runtime/common.hpp" #include "openvino/runtime/iinfer_request.hpp" #include "openvino/runtime/profiling_info.hpp" @@ -162,7 +163,11 @@ class OPENVINO_RUNTIME_API ISyncInferRequest : public IInferRequest { private: std::shared_ptr m_compiled_model; // Mutable to return reference to ov::Tensor - mutable std::unordered_map, ov::SoPtr> m_tensors; + mutable std::unordered_map, + ov::SoPtr, + descriptor::TensorExtension::Hasher, + descriptor::TensorExtension::Equal> + m_tensors; // Cache ports mutable std::unordered_map m_cached_ports; mutable std::mutex m_cache_mutex; diff --git a/src/inference/include/openvino/runtime/core.hpp 
b/src/inference/include/openvino/runtime/core.hpp index f0ba27c1cf5daa..c13432d664e736 100644 --- a/src/inference/include/openvino/runtime/core.hpp +++ b/src/inference/include/openvino/runtime/core.hpp @@ -25,6 +25,10 @@ #include "openvino/runtime/remote_context.hpp" #include "openvino/runtime/tensor.hpp" +#ifdef OPENVINO_CPP_VER_17 +# include +#endif + namespace ov { /** @@ -95,9 +99,18 @@ class OPENVINO_RUNTIME_API Core { * * TF (*.pb) * * TFLite (*.tflite) * @return A model. + * @{ */ std::shared_ptr read_model(const std::string& model_path, const std::string& bin_path = {}) const; +#ifdef OPENVINO_CPP_VER_17 + template >* = nullptr> + std::shared_ptr read_model(const Path& model_path, const Path& bin_path = {}) const { + return read_model(model_path.string(), bin_path.string()); + } +#endif + /// @} + /** * @brief Reads models from IR / ONNX / PDPD / TF / TFLite formats. * @param model String with a model in IR / ONNX / PDPD / TF / TFLite format. @@ -197,6 +210,13 @@ class OPENVINO_RUNTIME_API Core { */ CompiledModel compile_model(const std::string& model_path, const AnyMap& properties = {}); +#ifdef OPENVINO_CPP_VER_17 + template >* = nullptr> + auto compile_model(const Path& model_path, const AnyMap& properties = {}) const { + return compile_model(model_path.string(), properties); + } +#endif + #ifdef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT CompiledModel compile_model(const std::wstring& model_path, const AnyMap& properties = {}); #endif @@ -223,6 +243,13 @@ class OPENVINO_RUNTIME_API Core { return compile_model(model_path, AnyMap{std::forward(properties)...}); } +#ifdef OPENVINO_CPP_VER_17 + template >* = nullptr> + auto compile_model(const Path& model_path, Properties&&... 
properties) { + return compile_model(model_path.string(), std::forward(properties)...); + } +#endif + #ifdef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT template util::EnableIfAllStringAny compile_model(const std::wstring& model_path, @@ -250,6 +277,13 @@ class OPENVINO_RUNTIME_API Core { const std::string& device_name, const AnyMap& properties = {}); +#ifdef OPENVINO_CPP_VER_17 + template >* = nullptr> + auto compile_model(const Path& model_path, const std::string& device_name, const AnyMap& properties = {}) { + return compile_model(model_path.string(), device_name, properties); + } +#endif + #ifdef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT CompiledModel compile_model(const std::wstring& model_path, const std::string& device_name, @@ -279,6 +313,13 @@ class OPENVINO_RUNTIME_API Core { return compile_model(model_path, device_name, AnyMap{std::forward(properties)...}); } +#ifdef OPENVINO_CPP_VER_17 + template >* = nullptr> + auto compile_model(const Path& model_path, const std::string& device_name, Properties&&... properties) { + return compile_model(model_path.string(), device_name, std::forward(properties)...); + } +#endif + #ifdef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT template util::EnableIfAllStringAny compile_model(const std::wstring& model_path, @@ -359,9 +400,18 @@ class OPENVINO_RUNTIME_API Core { /** * @brief Registers an extension to a Core object. * @param library_path Path to the library with ov::Extension. + * @{ */ void add_extension(const std::string& library_path); +#ifdef OPENVINO_CPP_VER_17 + template >* = nullptr> + void add_extension(const Path& model_path) { + add_extension(model_path.string()); + } +#endif + /// @} + #ifdef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT /** * @brief Registers an extension to a Core object. 
diff --git a/src/inference/include/openvino/runtime/intel_npu/properties.hpp b/src/inference/include/openvino/runtime/intel_npu/properties.hpp index 49416f61b8b43b..8734757da1d53d 100644 --- a/src/inference/include/openvino/runtime/intel_npu/properties.hpp +++ b/src/inference/include/openvino/runtime/intel_npu/properties.hpp @@ -95,5 +95,12 @@ static constexpr ov::Property max_tiles{"NPU_MAX_TILES"}; */ static constexpr ov::Property bypass_umd_caching{"NPU_BYPASS_UMD_CACHING"}; +/** + * @brief [Only for NPU Plugin] + * Type: boolean, default is false + * This option allows delaying the weights loading until the inference request is created + */ +static constexpr ov::Property defer_weights_load{"NPU_DEFER_WEIGHTS_LOAD"}; + } // namespace intel_npu } // namespace ov diff --git a/src/inference/src/cache_manager.hpp b/src/inference/src/cache_manager.hpp index c441811c3cfd02..82813e5dd4788f 100644 --- a/src/inference/src/cache_manager.hpp +++ b/src/inference/src/cache_manager.hpp @@ -69,7 +69,7 @@ class ICacheManager { /** * @brief Function passing created input stream */ - using StreamReader = std::function; + using StreamReader = std::function)>; /** * @brief Callback when OpenVINO intends to read model from cache @@ -143,10 +143,10 @@ class FileStorageCacheManager final : public ICacheManager { std::make_shared>>(mmap->data(), mmap->size(), mmap); OwningSharedStreamBuffer buf(shared_buffer); std::istream stream(&buf); - reader(stream); + reader(stream, shared_buffer); } else { std::ifstream stream(blob_file_name, std::ios_base::binary); - reader(stream); + reader(stream, nullptr); } } } diff --git a/src/inference/src/cpp/compiled_model.cpp b/src/inference/src/cpp/compiled_model.cpp index d675cba4714887..c780bbee1e991d 100644 --- a/src/inference/src/cpp/compiled_model.cpp +++ b/src/inference/src/cpp/compiled_model.cpp @@ -8,10 +8,6 @@ #include "openvino/runtime/icompiled_model.hpp" #include "openvino/runtime/properties.hpp" -#if defined(OPENVINO_GNU_LIBC) && !defined(__ANDROID__) 
-# include -#endif - #define OV_COMPILED_MODEL_CALL_STATEMENT(...) \ if (_impl == nullptr) \ OPENVINO_THROW("CompiledModel was not initialized."); \ @@ -27,12 +23,6 @@ namespace ov { CompiledModel::~CompiledModel() { _impl = {}; -#if defined(OPENVINO_GNU_LIBC) && !defined(__ANDROID__) - // Linux memory margent doesn't return system memory immediate after release. - // It depends on memory chunk size and allocation history. - // Try return memory from a process to system now to reduce memory usage and not wait to the end of the process. - malloc_trim(0); -#endif } CompiledModel::CompiledModel(const std::shared_ptr& impl, const std::shared_ptr& so) diff --git a/src/inference/src/dev/core_impl.cpp b/src/inference/src/dev/core_impl.cpp index 244d27b5eebb67..673f6fd569a11e 100644 --- a/src/inference/src/dev/core_impl.cpp +++ b/src/inference/src/dev/core_impl.cpp @@ -1413,7 +1413,7 @@ ov::SoPtr ov::CoreImpl::load_model_from_cache( cacheContent.blobId, coreConfig.get_enable_mmap() && ov::util::contains(plugin.get_property(ov::internal::supported_properties), ov::internal::caching_with_mmap), - [&](std::istream& networkStream) { + [&](std::istream& networkStream, std::shared_ptr model_buffer) { OV_ITT_SCOPE(FIRST_INFERENCE, ov::itt::domains::LoadTime, "Core::load_model_from_cache::ReadStreamAndImport"); @@ -1459,6 +1459,9 @@ ov::SoPtr ov::CoreImpl::load_model_from_cache( update_config[ov::weights_path.name()] = weights_path; } } + if (model_buffer) { + update_config[ov::internal::cached_model_buffer.name()] = model_buffer; + } compiled_model = context ? 
plugin.import_model(networkStream, context, update_config) : plugin.import_model(networkStream, update_config); }); diff --git a/src/inference/src/dev/icompiled_model.cpp b/src/inference/src/dev/icompiled_model.cpp index b1cbedac1632ab..3f4a8d397ab4d9 100644 --- a/src/inference/src/dev/icompiled_model.cpp +++ b/src/inference/src/dev/icompiled_model.cpp @@ -10,6 +10,10 @@ #include "openvino/runtime/properties.hpp" #include "transformations/utils/utils.hpp" +#if defined(OPENVINO_GNU_LIBC) && !defined(__ANDROID__) +# include +#endif + ov::ICompiledModel::ICompiledModel(const std::shared_ptr& model, const std::shared_ptr& plugin, const std::shared_ptr& task_executor, @@ -47,8 +51,11 @@ ov::ICompiledModel::ICompiledModel(const std::shared_ptr& model } } } - - std::unordered_map, std::shared_ptr> tensor_map; + std::unordered_map, + std::shared_ptr, + descriptor::TensorExtension::Hasher, + descriptor::TensorExtension::Equal> + tensor_map; for (const auto& param : model->get_parameters()) { const auto& param_name = param->get_friendly_name(); auto new_param = ov::as_type_ptr(param->copy_with_new_inputs({})); @@ -151,3 +158,12 @@ void ov::ICompiledModel::set_model_shared_object(ov::Model& model, const std::sh void ov::ICompiledModel::release_memory() { // nothing to do } + +ov::ICompiledModel::~ICompiledModel() { +#if defined(OPENVINO_GNU_LIBC) && !defined(__ANDROID__) + // Linux memory management doesn't return system memory immediately after release. + // It depends on memory chunk size and allocation history. + // Try to return memory from the process to the system now to reduce memory usage and not wait to the end of the process. 
+ malloc_trim(0); +#endif +} diff --git a/src/inference/src/model_reader.cpp b/src/inference/src/model_reader.cpp index aaf620ea0f803a..7babef019b5802 100644 --- a/src/inference/src/model_reader.cpp +++ b/src/inference/src/model_reader.cpp @@ -86,7 +86,7 @@ void update_v10_model(std::shared_ptr& model, bool frontendMode = fal "Model operation names have collisions with tensor names.", " Please use MO to generate new IR version, it should allow to avoid the issue"); leaf_names.emplace(res_name, nullptr); - result->output(0).get_tensor().add_names({std::move(res_name)}); + result->input(0).get_tensor().add_names({std::move(res_name)}); } for (const auto& param : model->get_parameters()) { const auto& param_name = param->get_friendly_name(); diff --git a/src/inference/tests/functional/caching_test.cpp b/src/inference/tests/functional/caching_test.cpp index 5b01af9a22cde8..6b1c7f938ae731 100644 --- a/src/inference/tests/functional/caching_test.cpp +++ b/src/inference/tests/functional/caching_test.cpp @@ -2424,6 +2424,142 @@ TEST_P(CachingTest, Load_threads) { std::cout << "Caching Load multiple threads test completed. 
Tried " << index << " times" << std::endl; } +TEST_P(CachingTest, Load_mmap) { + ON_CALL(*mockPlugin, import_model(_, _)).WillByDefault(Invoke([&](std::istream& istr, const ov::AnyMap& config) { + if (m_checkConfigCb) { + m_checkConfigCb(config); + } + std::shared_ptr model_buffer; + if (config.count(ov::internal::cached_model_buffer.name())) + model_buffer = config.at(ov::internal::cached_model_buffer.name()).as>(); + EXPECT_TRUE(model_buffer); + + std::string name; + istr >> name; + char space; + istr.read(&space, 1); + std::lock_guard lock(mock_creation_mutex); + return create_mock_compiled_model(m_models[name], mockPlugin); + })); + + ON_CALL(*mockPlugin, get_property(ov::internal::supported_properties.name(), _)) + .WillByDefault(Invoke([&](const std::string&, const ov::AnyMap&) { + return std::vector{ov::internal::caching_properties.name(), + ov::internal::caching_with_mmap.name()}; + })); + EXPECT_CALL(*mockPlugin, get_property(_, _)).Times(AnyNumber()); + EXPECT_CALL(*mockPlugin, query_model(_, _)).Times(AnyNumber()); + EXPECT_CALL(*mockPlugin, get_property(ov::device::architecture.name(), _)).Times(AnyNumber()); + EXPECT_CALL(*mockPlugin, get_property(ov::internal::caching_properties.name(), _)).Times(AnyNumber()); + if (m_remoteContext) { + return; // skip the remote Context test for Multi plugin + } + int index = 0; + m_post_mock_net_callbacks.emplace_back([&](MockICompiledModelImpl& net) { + EXPECT_CALL(net, export_model(_)).Times(1); + }); + MkDirGuard guard(m_cacheDir); + EXPECT_CALL(*mockPlugin, compile_model(_, _, _)).Times(0); + EXPECT_CALL(*mockPlugin, compile_model(A&>(), _)).Times(1); + EXPECT_CALL(*mockPlugin, import_model(_, _, _)).Times(0); + EXPECT_CALL(*mockPlugin, import_model(_, _)).Times(1); + testLoad([&](ov::Core& core) { + core.set_property({{ov::cache_dir.name(), m_cacheDir}}); + m_testFunction(core); + m_testFunction(core); + }); + std::cout << "Caching Load multiple threads test completed. 
Tried " << index << " times" << std::endl; +} + +TEST_P(CachingTest, Load_mmap_is_disabled) { + ON_CALL(*mockPlugin, import_model(_, _)).WillByDefault(Invoke([&](std::istream& istr, const ov::AnyMap& config) { + if (m_checkConfigCb) { + m_checkConfigCb(config); + } + std::shared_ptr model_buffer; + if (config.count(ov::internal::cached_model_buffer.name())) + model_buffer = config.at(ov::internal::cached_model_buffer.name()).as>(); + EXPECT_FALSE(model_buffer); + + std::string name; + istr >> name; + char space; + istr.read(&space, 1); + std::lock_guard lock(mock_creation_mutex); + return create_mock_compiled_model(m_models[name], mockPlugin); + })); + ON_CALL(*mockPlugin, get_property(ov::internal::supported_properties.name(), _)) + .WillByDefault(Invoke([&](const std::string&, const ov::AnyMap&) { + return std::vector{ov::internal::caching_properties.name(), + ov::internal::caching_with_mmap.name()}; + })); + EXPECT_CALL(*mockPlugin, get_property(_, _)).Times(AnyNumber()); + EXPECT_CALL(*mockPlugin, query_model(_, _)).Times(AnyNumber()); + EXPECT_CALL(*mockPlugin, get_property(ov::device::architecture.name(), _)).Times(AnyNumber()); + EXPECT_CALL(*mockPlugin, get_property(ov::internal::caching_properties.name(), _)).Times(AnyNumber()); + if (m_remoteContext) { + return; // skip the remote Context test for Multi plugin + } + int index = 0; + m_post_mock_net_callbacks.emplace_back([&](MockICompiledModelImpl& net) { + EXPECT_CALL(net, export_model(_)).Times(1); + }); + MkDirGuard guard(m_cacheDir); + EXPECT_CALL(*mockPlugin, compile_model(_, _, _)).Times(0); + EXPECT_CALL(*mockPlugin, compile_model(A&>(), _)).Times(1); + EXPECT_CALL(*mockPlugin, import_model(_, _, _)).Times(0); + EXPECT_CALL(*mockPlugin, import_model(_, _)).Times(1); + testLoad([&](ov::Core& core) { + core.set_property({{ov::cache_dir.name(), m_cacheDir}}); + core.set_property({ov::enable_mmap(false)}); + m_testFunction(core); + m_testFunction(core); + }); + std::cout << "Caching Load multiple 
threads test completed. Tried " << index << " times" << std::endl; +} + +TEST_P(CachingTest, Load_mmap_is_not_supported_by_plugin) { + ON_CALL(*mockPlugin, import_model(_, _)).WillByDefault(Invoke([&](std::istream& istr, const ov::AnyMap& config) { + if (m_checkConfigCb) { + m_checkConfigCb(config); + } + std::shared_ptr model_buffer; + if (config.count(ov::internal::cached_model_buffer.name())) + model_buffer = config.at(ov::internal::cached_model_buffer.name()).as>(); + EXPECT_FALSE(model_buffer); + + std::string name; + istr >> name; + char space; + istr.read(&space, 1); + std::lock_guard lock(mock_creation_mutex); + return create_mock_compiled_model(m_models[name], mockPlugin); + })); + EXPECT_CALL(*mockPlugin, get_property(_, _)).Times(AnyNumber()); + EXPECT_CALL(*mockPlugin, query_model(_, _)).Times(AnyNumber()); + EXPECT_CALL(*mockPlugin, get_property(ov::device::architecture.name(), _)).Times(AnyNumber()); + EXPECT_CALL(*mockPlugin, get_property(ov::internal::caching_properties.name(), _)).Times(AnyNumber()); + if (m_remoteContext) { + return; // skip the remote Context test for Multi plugin + } + int index = 0; + m_post_mock_net_callbacks.emplace_back([&](MockICompiledModelImpl& net) { + EXPECT_CALL(net, export_model(_)).Times(1); + }); + MkDirGuard guard(m_cacheDir); + EXPECT_CALL(*mockPlugin, compile_model(_, _, _)).Times(0); + EXPECT_CALL(*mockPlugin, compile_model(A&>(), _)).Times(1); + EXPECT_CALL(*mockPlugin, import_model(_, _, _)).Times(0); + EXPECT_CALL(*mockPlugin, import_model(_, _)).Times(1); + testLoad([&](ov::Core& core) { + core.set_property({{ov::cache_dir.name(), m_cacheDir}}); + core.set_property({ov::enable_mmap(true)}); + m_testFunction(core); + m_testFunction(core); + }); + std::cout << "Caching Load multiple threads test completed. 
Tried " << index << " times" << std::endl; +} + #if defined(ENABLE_OV_IR_FRONTEND) static std::string getTestCaseName(const testing::TestParamInfo>& obj) { diff --git a/src/inference/tests/functional/ov_core_test.cpp b/src/inference/tests/functional/ov_core_test.cpp index 26eb38e3fd13e5..60f91b85b3338a 100644 --- a/src/inference/tests/functional/ov_core_test.cpp +++ b/src/inference/tests/functional/ov_core_test.cpp @@ -8,9 +8,26 @@ #include "common_test_utils/common_utils.hpp" #include "common_test_utils/file_utils.hpp" +#include "functional_test_utils/test_model/test_model.hpp" #include "openvino/runtime/core.hpp" #include "openvino/util/file_util.hpp" +class CoreBaseTest : public testing::Test { +protected: + void generate_test_model_files(const std::string& name) { + auto prefix = ov::test::utils::generateTestFilePrefix(); + model_file_name = prefix + name + ".xml"; + weight_file_name = prefix + name + ".bin"; + ov::test::utils::generate_test_model(model_file_name, weight_file_name); + } + + void TearDown() override { + ov::test::utils::removeIRFiles(model_file_name, weight_file_name); + } + + std::string model_file_name, weight_file_name; +}; + #ifndef OPENVINO_STATIC_LIBRARY static void create_plugin_xml(const std::string& file_name, const std::string& plugin_name = "1") { @@ -33,7 +50,7 @@ static void remove_plugin_xml(const std::string& file_name) { ov::test::utils::removeFile(file_name); } -TEST(CoreBaseTest, LoadPluginXML) { +TEST_F(CoreBaseTest, LoadPluginXML) { std::string xml_file_name = "test_plugin.xml"; std::string xml_file_path = ov::test::utils::getOpenvinoLibDirectory() + ov::util::FileTraits::file_separator + xml_file_name; @@ -42,7 +59,7 @@ TEST(CoreBaseTest, LoadPluginXML) { remove_plugin_xml(xml_file_path); } -TEST(CoreBaseTest, LoadPluginDifferentXMLExtension) { +TEST_F(CoreBaseTest, LoadPluginDifferentXMLExtension) { std::string xml_file_name = "test_plugin.test"; std::string xml_file_path = ov::test::utils::getOpenvinoLibDirectory() + 
ov::util::FileTraits::file_separator + xml_file_name; @@ -51,7 +68,7 @@ TEST(CoreBaseTest, LoadPluginDifferentXMLExtension) { remove_plugin_xml(xml_file_path); } -TEST(CoreBaseTest, LoadAbsoluteOVPathPluginXML) { +TEST_F(CoreBaseTest, LoadAbsoluteOVPathPluginXML) { std::string xml_file_name = "test_plugin.xml"; std::string xml_file_path = ov::test::utils::getOpenvinoLibDirectory() + ov::util::FileTraits::file_separator + xml_file_name; @@ -60,7 +77,7 @@ TEST(CoreBaseTest, LoadAbsoluteOVPathPluginXML) { remove_plugin_xml(xml_file_path); } -TEST(CoreBaseTest, LoadAbsoluteCWPathPluginXML) { +TEST_F(CoreBaseTest, LoadAbsoluteCWPathPluginXML) { std::string xml_file_name = "test_plugin.xml"; std::string xml_file_path = ov::test::utils::getCurrentWorkingDir() + ov::util::FileTraits::file_separator + xml_file_name; @@ -69,7 +86,7 @@ TEST(CoreBaseTest, LoadAbsoluteCWPathPluginXML) { remove_plugin_xml(xml_file_path); } -TEST(CoreBaseTest, LoadRelativeCWPathPluginXML) { +TEST_F(CoreBaseTest, LoadRelativeCWPathPluginXML) { std::string xml_file_name = "test_plugin.xml"; std::string xml_file_path = ov::test::utils::getCurrentWorkingDir() + ov::util::FileTraits::file_separator + xml_file_name; @@ -78,7 +95,7 @@ TEST(CoreBaseTest, LoadRelativeCWPathPluginXML) { remove_plugin_xml(xml_file_path); } -TEST(CoreBaseTest, LoadOVFolderOverCWPathPluginXML) { +TEST_F(CoreBaseTest, LoadOVFolderOverCWPathPluginXML) { std::string xml_file_name = "test_plugin.xml"; std::string cwd_file_path = ov::test::utils::getCurrentWorkingDir() + ov::util::FileTraits::file_separator + xml_file_name; @@ -96,3 +113,43 @@ TEST(CoreBaseTest, LoadOVFolderOverCWPathPluginXML) { } #endif + +#if defined(OPENVINO_CPP_VER_17) && defined(ENABLE_OV_IR_FRONTEND) +namespace ov::test { +TEST_F(CoreBaseTest, read_model_with_std_fs_path) { + generate_test_model_files("test-model"); + + const auto model_path = std::filesystem::path(model_file_name); + const auto weight_path = std::filesystem::path(weight_file_name); + + 
ov::Core core; + { + const auto model = core.read_model(model_path); + EXPECT_NE(model, nullptr); + } + { + const auto model = core.read_model(model_path, weight_path); + EXPECT_NE(model, nullptr); + } +} + +TEST_F(CoreBaseTest, compile_model_with_std_fs_path) { + generate_test_model_files("model2"); + + const auto model_path = std::filesystem::path(model_file_name); + const auto weight_path = std::filesystem::path(weight_file_name); + + ov::Core core; + { + const auto model = core.compile_model(model_path); + EXPECT_TRUE(model); + } + { + const auto devices = core.get_available_devices(); + + const auto model = core.compile_model(model_path, devices.at(0), ov::AnyMap{}); + EXPECT_TRUE(model); + } +} +} // namespace ov::test +#endif diff --git a/src/inference/tests/functional/ov_extension_test.cpp b/src/inference/tests/functional/ov_extension_test.cpp index 6f93a8acdaf2fa..b840c430d092e9 100644 --- a/src/inference/tests/functional/ov_extension_test.cpp +++ b/src/inference/tests/functional/ov_extension_test.cpp @@ -82,6 +82,12 @@ class CustomReLU : public ov::op::Op { }; #if defined(ENABLE_OV_IR_FRONTEND) +# ifdef OPENVINO_CPP_VER_17 +TEST_F(OVExtensionTests, ReshapeIRWithNewExtensionsPathLib) { + core.add_extension(std::filesystem::path(getOVExtensionPath())); + test(); +} +# endif TEST_F(OVExtensionTests, ReshapeIRWithNewExtensionsLib) { core.add_extension(getOVExtensionPath()); diff --git a/src/plugins/intel_cpu/CMakeLists.txt b/src/plugins/intel_cpu/CMakeLists.txt index 04909c7d8f5a5a..eb56a3fb39503e 100644 --- a/src/plugins/intel_cpu/CMakeLists.txt +++ b/src/plugins/intel_cpu/CMakeLists.txt @@ -242,7 +242,8 @@ ov_add_plugin(NAME ${TARGET_NAME} DEVICE_NAME "CPU" AS_EXTENSION VERSION_DEFINES_FOR src/plugin.cpp - SOURCES ${SOURCES} ${HEADERS}) + SOURCES ${SOURCES} ${HEADERS} + ADD_CLANG_FORMAT) # give a different file name depending on target platform architecture if(ARM OR AARCH64) diff --git a/src/plugins/intel_cpu/src/cache/cache_entry.h 
b/src/plugins/intel_cpu/src/cache/cache_entry.h index 135a1090a60045..6e71e207b0a71c 100644 --- a/src/plugins/intel_cpu/src/cache/cache_entry.h +++ b/src/plugins/intel_cpu/src/cache/cache_entry.h @@ -4,8 +4,9 @@ #pragma once -#include #include +#include + #include "lru_cache.h" namespace ov { @@ -13,27 +14,24 @@ namespace intel_cpu { class CacheEntryBase { public: - enum class LookUpStatus : int8_t { - Hit, - Miss - }; + enum class LookUpStatus : int8_t { Hit, Miss }; + public: virtual ~CacheEntryBase() = default; }; /** * @brief Class represents a templated record in multi cache - * @tparam KeyType is a key type that must define hash() const method with return type convertible to size_t and define comparison operator. + * @tparam KeyType is a key type that must define hash() const method with return type convertible to size_t and define + * comparison operator. * @tparam ValType is a type that must meet all the requirements to the std::unordered_map mapped type - * @tparam ImplType is a type for the internal storage. It must provide put(KeyType, ValueType) and ValueType get(const KeyType&) - * interface and must have constructor of type ImplType(size_t). + * @tparam ImplType is a type for the internal storage. It must provide put(KeyType, ValueType) and ValueType get(const + * KeyType&) interface and must have constructor of type ImplType(size_t). * * @note In this implementation default constructed value objects are treated as empty objects. */ -template> +template > class CacheEntry : public CacheEntryBase { public: using ResultType = std::pair; @@ -42,11 +40,12 @@ class CacheEntry : public CacheEntryBase { explicit CacheEntry(size_t capacity) : _impl(capacity) {} /** - * @brief Searches the key in the underlying storage and returns value if it exists, or creates a value using the builder functor and adds it to - * the underlying storage. 
+ * @brief Searches the key in the underlying storage and returns value if it exists, or creates a value using the + * builder functor and adds it to the underlying storage. * @param key is the search key * @param builder is a callable object that creates the ValType object from the KeyType lval reference - * @return result of the operation which is a pair of the requested object of ValType and the status of whether the cache hit or miss occurred + * @return result of the operation which is a pair of the requested object of ValType and the status of whether the + * cache hit or miss occurred */ ResultType getOrCreate(const KeyType& key, std::function builder) { @@ -70,5 +69,5 @@ class CacheEntry : public CacheEntryBase { ImplType _impl; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/cache/lru_cache.h b/src/plugins/intel_cpu/src/cache/lru_cache.h index 792451da16c484..c3a4d47aa9de9f 100644 --- a/src/plugins/intel_cpu/src/cache/lru_cache.h +++ b/src/plugins/intel_cpu/src/cache/lru_cache.h @@ -10,7 +10,8 @@ /** * @brief This is yet another implementation of a preemptive cache with LRU eviction policy. - * @tparam Key is a key type that must define hash() const method with return type convertible to size_t and define comparison operator. + * @tparam Key is a key type that must define hash() const method with return type convertible to size_t and define + * comparison operator. * @tparam Value is a type that must meet all the requirements to the std::unordered_map mapped type * * @attention This cache implementation IS NOT THREAD SAFE! 
@@ -19,7 +20,7 @@ namespace ov { namespace intel_cpu { -template +template class LruCache { public: using value_type = std::pair; @@ -33,7 +34,7 @@ class LruCache { * @param value */ - void put(const Key &key, const Value &val) { + void put(const Key& key, const Value& val) { if (0 == _capacity) { return; } @@ -56,7 +57,7 @@ class LruCache { * @return Value associated with the key or default constructed instance of the Value type. */ - Value get(const Key &key) { + Value get(const Key& key) { auto itr = _cacheMapper.find(key); if (itr == _cacheMapper.end()) { return Value(); @@ -82,13 +83,13 @@ class LruCache { * @brief Returns the current capacity value * @return the current capacity value */ - size_t getCapacity() const noexcept { - return _capacity; - } + size_t getCapacity() const noexcept { + return _capacity; + } private: struct key_hasher { - std::size_t operator()(const Key &k) const { + std::size_t operator()(const Key& k) const { return k.hash(); } }; @@ -105,5 +106,5 @@ class LruCache { size_t _capacity; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/cache/multi_cache.cpp b/src/plugins/intel_cpu/src/cache/multi_cache.cpp index 29dad18a41c770..325dfb517831b5 100644 --- a/src/plugins/intel_cpu/src/cache/multi_cache.cpp +++ b/src/plugins/intel_cpu/src/cache/multi_cache.cpp @@ -9,5 +9,5 @@ namespace intel_cpu { std::atomic_size_t MultiCache::_typeIdCounter{0}; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/cache/multi_cache.h b/src/plugins/intel_cpu/src/cache/multi_cache.h index d9b6e5f8bfe19a..e216efe6fea801 100644 --- a/src/plugins/intel_cpu/src/cache/multi_cache.h +++ b/src/plugins/intel_cpu/src/cache/multi_cache.h @@ -4,9 +4,10 @@ #pragma once +#include #include #include -#include + #include "cache_entry.h" namespace ov { @@ -20,27 +21,28 @@ namespace intel_cpu { class MultiCache { public: 
- template + template using EntryTypeT = CacheEntry; using EntryBasePtr = std::shared_ptr; - template + template using EntryPtr = std::shared_ptr>; public: /** - * @param capacity here means maximum records limit FOR EACH entry specified by a pair of Key/Value types. - * @note zero capacity means empty cache so no records are stored and no entries are created - */ + * @param capacity here means maximum records limit FOR EACH entry specified by a pair of Key/Value types. + * @note zero capacity means empty cache so no records are stored and no entries are created + */ explicit MultiCache(size_t capacity) : _capacity(capacity) {} /** - * @brief Searches a value of ValueType in the cache using the provided key or creates a new ValueType instance (if nothing was found) - * using the key and the builder functor and adds the new record to the cache - * @param key is the search key - * @param builder is a callable object that creates the ValType object from the KeyType lval reference. - * Also the builder type is used for the ValueType deduction - * @return result of the operation which is a pair of the requested object of ValType and the status of whether the cache hit or miss occurred - */ + * @brief Searches a value of ValueType in the cache using the provided key or creates a new ValueType instance (if + * nothing was found) using the key and the builder functor and adds the new record to the cache + * @param key is the search key + * @param builder is a callable object that creates the ValType object from the KeyType lval reference. 
+ * Also the builder type is used for the ValueType deduction + * @return result of the operation which is a pair of the requested object of ValType and the status of whether the + * cache hit or miss occurred + */ template 201703L)) || (defined(__cplusplus) && (__cplusplus > 201703L)) @@ -54,9 +56,9 @@ class MultiCache { } private: - template + template size_t getTypeId(); - template + template EntryPtr getEntry(); private: @@ -65,13 +67,13 @@ class MultiCache { std::unordered_map _storage; }; -template +template size_t MultiCache::getTypeId() { static size_t id = _typeIdCounter.fetch_add(1); return id; } -template +template MultiCache::EntryPtr MultiCache::getEntry() { using EntryType = EntryTypeT; size_t id = getTypeId(); @@ -88,5 +90,5 @@ using MultiCacheWeakCPtr = std::weak_ptr; using MultiCachePtr = std::shared_ptr; using MultiCacheCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/compiled_model.cpp b/src/plugins/intel_cpu/src/compiled_model.cpp index bbee5d937be5d5..f0f9668ca3ae50 100644 --- a/src/plugins/intel_cpu/src/compiled_model.cpp +++ b/src/plugins/intel_cpu/src/compiled_model.cpp @@ -3,29 +3,30 @@ // #include "compiled_model.h" + +#include +#include + #include "async_infer_request.h" +#include "cpu/x64/cpu_isa_traits.hpp" #include "infer_request.h" #include "itt.h" #include "low_precision/low_precision.hpp" #include "memory_state.h" #include "openvino/core/type/element_type.hpp" #include "openvino/runtime/intel_cpu/properties.hpp" -#include "openvino/runtime/threading/executor_manager.hpp" -#include "transformations/transformation_pipeline.h" #include "openvino/runtime/properties.hpp" -#include "openvino/util/common_util.hpp" +#include "openvino/runtime/threading/cpu_message.hpp" #include "openvino/runtime/threading/cpu_streams_executor.hpp" -#include "transformations/utils/utils.hpp" #include "openvino/runtime/threading/cpu_streams_info.hpp" 
-#include "openvino/runtime/threading/cpu_message.hpp" +#include "openvino/runtime/threading/executor_manager.hpp" +#include "openvino/util/common_util.hpp" +#include "transformations/transformation_pipeline.h" +#include "transformations/utils/utils.hpp" #include "utils/serialize.hpp" -#include "cpu/x64/cpu_isa_traits.hpp" -#include -#include - #if defined(OV_CPU_WITH_ACL) -#include "nodes/executors/acl/acl_ie_scheduler.hpp" +# include "nodes/executors/acl/acl_ie_scheduler.hpp" #endif using namespace ov::threading; @@ -329,8 +330,7 @@ ov::Any CompiledModel::get_property(const std::string& name) const { return decltype(ov::intel_cpu::sparse_weights_decompression_rate)::value_type( config.fcSparseWeiDecompressionRate); } else if (name == ov::hint::dynamic_quantization_group_size) { - return decltype(ov::hint::dynamic_quantization_group_size)::value_type( - config.fcDynamicQuantizationGroupSize); + return decltype(ov::hint::dynamic_quantization_group_size)::value_type(config.fcDynamicQuantizationGroupSize); } else if (name == ov::hint::kv_cache_precision) { return decltype(ov::hint::kv_cache_precision)::value_type(config.kvCachePrecision); } diff --git a/src/plugins/intel_cpu/src/compiled_model.h b/src/plugins/intel_cpu/src/compiled_model.h index faedf1ae5a744c..dc3735b4f3b63e 100644 --- a/src/plugins/intel_cpu/src/compiled_model.h +++ b/src/plugins/intel_cpu/src/compiled_model.h @@ -94,5 +94,5 @@ class CompiledModel : public ov::ICompiledModel { bool m_has_sub_compiled_models = false; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/config.cpp b/src/plugins/intel_cpu/src/config.cpp index 7ce4c1069e695d..7d1ee05897e81d 100644 --- a/src/plugins/intel_cpu/src/config.cpp +++ b/src/plugins/intel_cpu/src/config.cpp @@ -4,19 +4,19 @@ #include "config.h" +#include +#include +#include + #include "cpu/x64/cpu_isa_traits.hpp" #include "openvino/core/parallel.hpp" #include 
"openvino/core/type/element_type_traits.hpp" #include "openvino/runtime/intel_cpu/properties.hpp" #include "openvino/runtime/internal_properties.hpp" #include "openvino/runtime/properties.hpp" +#include "utils/cpu_utils.hpp" #include "utils/debug_capabilities.h" #include "utils/precision_support.h" -#include "utils/cpu_utils.hpp" - -#include -#include -#include namespace ov { namespace intel_cpu { @@ -61,9 +61,7 @@ Config::Config() { */ void Config::applyDebugCapsProperties() { // always enable perf counters for verbose, performance summary and average counters - if (!debugCaps.verbose.empty() || - !debugCaps.summaryPerf.empty() || - !debugCaps.averageCountersPath.empty()) { + if (!debugCaps.verbose.empty() || !debugCaps.summaryPerf.empty() || !debugCaps.averageCountersPath.empty()) { collectPerfCounters = true; } } @@ -151,10 +149,10 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) { logLevel = val.as(); } catch (const ov::Exception&) { OPENVINO_THROW("Wrong value ", - val.as(), - " for property key ", - key, - ". Expected only ov::log::Level::NO/ERR/WARNING/INFO/DEBUG/TRACE."); + val.as(), + " for property key ", + key, + ". Expected only ov::log::Level::NO/ERR/WARNING/INFO/DEBUG/TRACE."); } } else if (key == ov::hint::num_requests.name()) { try { @@ -243,8 +241,8 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) { fcDynamicQuantizationGroupSize = val.as(); } catch (const ov::Exception&) { OPENVINO_THROW("Wrong value for property key ", - ov::hint::dynamic_quantization_group_size.name(), - ". Expected only unsinged integer numbers"); + ov::hint::dynamic_quantization_group_size.name(), + ". 
Expected only unsinged integer numbers"); } } else if (key == ov::enable_profiling.name()) { try { @@ -366,7 +364,7 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) { if (one_of(prec, ov::element::f32, ov::element::f16, ov::element::bf16, ov::element::u8)) { kvCachePrecision = prec; } else { - OPENVINO_THROW("invalid value"); + OPENVINO_THROW("invalid value"); } } catch (ov::Exception&) { OPENVINO_THROW("Wrong value ", @@ -460,5 +458,19 @@ void Config::updateProperties() { _config.insert({ov::hint::num_requests.name(), std::to_string(hintNumRequests)}); } +void Config::applyRtInfo(const std::shared_ptr& model) { + // if user sets explicitly, it will be higher priority than rt_info + if (!kvCachePrecisionSetExplicitly && + model->has_rt_info({"runtime_options", ov::hint::kv_cache_precision.name()})) { + this->kvCachePrecision = + model->get_rt_info({"runtime_options", ov::hint::kv_cache_precision.name()}); + } + if (!fcDynamicQuantizationGroupSizeSetExplicitly && + model->has_rt_info({"runtime_options", ov::hint::dynamic_quantization_group_size.name()})) { + this->fcDynamicQuantizationGroupSize = + model->get_rt_info({"runtime_options", ov::hint::dynamic_quantization_group_size.name()}); + } +} + } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/config.h b/src/plugins/intel_cpu/src/config.h index 5f4bb25ede350e..1aa08f4412f0b3 100644 --- a/src/plugins/intel_cpu/src/config.h +++ b/src/plugins/intel_cpu/src/config.h @@ -4,18 +4,17 @@ #pragma once +#include +#include +#include + +#include "internal_properties.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/runtime/properties.hpp" #include "openvino/runtime/threading/istreams_executor.hpp" #include "openvino/util/common_util.hpp" - -#include "internal_properties.hpp" #include "utils/debug_caps_config.h" -#include -#include -#include - namespace ov { namespace intel_cpu { struct Config { @@ -38,11 +37,7 @@ struct Config { Disable, }; 
- enum class ModelType { - CNN, - LLM, - Unknown - }; + enum class ModelType { CNN, LLM, Unknown }; bool collectPerfCounters = false; bool exclusiveAsyncRequests = false; @@ -69,7 +64,8 @@ struct Config { bool streamsChanged = false; int threads = 0; int threadsPerStream = 0; - ov::threading::IStreamsExecutor::ThreadBindingType threadBindingType = ov::threading::IStreamsExecutor::ThreadBindingType::NONE; + ov::threading::IStreamsExecutor::ThreadBindingType threadBindingType = + ov::threading::IStreamsExecutor::ThreadBindingType::NONE; ov::hint::PerformanceMode hintPerfMode = ov::hint::PerformanceMode::LATENCY; std::vector> streamsRankTable; bool changedHintPerfMode = false; @@ -106,6 +102,8 @@ struct Config { void updateProperties(); + void applyRtInfo(const std::shared_ptr& model); + std::map _config; int modelPreferThreads = -1; @@ -120,4 +118,4 @@ struct Config { }; } // namespace intel_cpu -} // namespace ov +} // namespace ov diff --git a/src/plugins/intel_cpu/src/cpu_memory.cpp b/src/plugins/intel_cpu/src/cpu_memory.cpp index 8e5fe8d72fd1f2..7cb4abc2161f14 100644 --- a/src/plugins/intel_cpu/src/cpu_memory.cpp +++ b/src/plugins/intel_cpu/src/cpu_memory.cpp @@ -3,14 +3,17 @@ // #include "cpu_memory.h" -#include "memory_desc/cpu_memory_desc_utils.h" + #include -#include "nodes/reorder.h" + +#include "memory_desc/cpu_memory_desc_utils.h" #include "nodes/common/cpu_memcpy.h" +#include "nodes/reorder.h" #include "utils/debug_capabilities.h" #if defined(__linux__) # include /* Definition of SYS_* constants */ # include + # include /* strerror(errno) */ #endif @@ -27,69 +30,72 @@ BlockedMemoryDescPtr IMemory::getDescWithType() const { } namespace { - inline void setSubnormalsToZero(float *data, size_t size) { - uint32_t *u32data = reinterpret_cast(data); - for (size_t i = 0; i < size; ++i) { - if ((u32data[i] & (0xFF << 23)) == 0) { - u32data[i] = 0; - } +inline void setSubnormalsToZero(float* data, size_t size) { + uint32_t* u32data = reinterpret_cast(data); + for 
(size_t i = 0; i < size; ++i) { + if ((u32data[i] & (0xFF << 23)) == 0) { + u32data[i] = 0; } } +} - void transferData(const IMemory& src, const IMemory& dst, bool ftz) { - node::Reorder::reorderData(src, dst); +void transferData(const IMemory& src, const IMemory& dst, bool ftz) { + node::Reorder::reorderData(src, dst); - if (!ftz) { - return; - } - if (src.getDesc().getPrecision() != ov::element::f32 || dst.getDesc().getPrecision() == ov::element::bf16) { + if (!ftz) { + return; + } + if (src.getDesc().getPrecision() != ov::element::f32 || dst.getDesc().getPrecision() == ov::element::bf16) { + return; + } + size_t offset = 0; + if (dst.getDesc().getType() & MemoryDescType::Dnnl) { + // here we can safely cast to DnnlMemoryDesc + auto dnnl_desc = dst.getDescWithType(); + auto desc = dnnl_desc->getDnnlDesc(); + dnnl::impl::memory_desc_wrapper wrapper(desc.get()); + offset = wrapper.offset0(); + if (wrapper.is_wino_desc() || wrapper.is_rnn_packed_desc()) { return; } - size_t offset = 0; - if (dst.getDesc().getType() & MemoryDescType::Dnnl) { - // here we can safely cast to DnnlMemoryDesc - auto dnnl_desc = dst.getDescWithType(); - auto desc = dnnl_desc->getDnnlDesc(); - dnnl::impl::memory_desc_wrapper wrapper(desc.get()); - offset = wrapper.offset0(); - if (wrapper.is_wino_desc() || wrapper.is_rnn_packed_desc()) { - return; - } - } - // actual FTZ - auto* memData = static_cast(dst.getData()); - memData += offset; - setSubnormalsToZero(memData, dst.getSize() / sizeof(float)); } + // actual FTZ + auto* memData = static_cast(dst.getData()); + memData += offset; + setSubnormalsToZero(memData, dst.getSize() / sizeof(float)); +} -} // namespace +} // namespace -Memory::Memory(const dnnl::engine& eng, MemoryDescPtr desc, const void* data, bool pads_zeroing) : - m_eng(eng), - m_pMemDesc(desc), - m_blockHandle(std::make_shared(make_unique()), this), - dnnlMemHandle(this) { - if (desc->getPrecision() == element::string) { - OPENVINO_THROW("[CPU] Memory object cannot be created 
for string data."); - } - create(m_pMemDesc, data, pads_zeroing); +Memory::Memory(const dnnl::engine& eng, MemoryDescPtr desc, const void* data, bool pads_zeroing) + : m_eng(eng), + m_pMemDesc(desc), + m_blockHandle(std::make_shared(make_unique()), this), + dnnlMemHandle(this) { + if (desc->getPrecision() == element::string) { + OPENVINO_THROW("[CPU] Memory object cannot be created for string data."); } + create(m_pMemDesc, data, pads_zeroing); +} -Memory::Memory(const dnnl::engine& eng, const MemoryDesc& desc, const void* data, bool pads_zeroing) : - Memory::Memory(eng, desc.clone(), data, pads_zeroing) {} - -Memory::Memory(const dnnl::engine& eng, MemoryDescPtr desc, MemoryBlockPtr block) : - m_eng(eng), m_pMemDesc(desc), m_blockHandle(block, this), dnnlMemHandle(this) { - if (desc->getPrecision() == element::string) { - OPENVINO_THROW("[CPU] Memory object can't be created for string data."); - } - bool memAllocated = m_blockHandle->getRawPtr(); +Memory::Memory(const dnnl::engine& eng, const MemoryDesc& desc, const void* data, bool pads_zeroing) + : Memory::Memory(eng, desc.clone(), data, pads_zeroing) {} - create(desc, nullptr, !memAllocated); +Memory::Memory(const dnnl::engine& eng, MemoryDescPtr desc, MemoryBlockPtr block) + : m_eng(eng), + m_pMemDesc(desc), + m_blockHandle(block, this), + dnnlMemHandle(this) { + if (desc->getPrecision() == element::string) { + OPENVINO_THROW("[CPU] Memory object can't be created for string data."); } + bool memAllocated = m_blockHandle->getRawPtr(); -Memory::Memory(const dnnl::engine& eng, const MemoryDesc& desc, MemoryBlockPtr block) : - Memory::Memory(eng, desc.clone(), block) {} + create(desc, nullptr, !memAllocated); +} + +Memory::Memory(const dnnl::engine& eng, const MemoryDesc& desc, MemoryBlockPtr block) + : Memory::Memory(eng, desc.clone(), block) {} size_t Memory::getSize() const { auto size = getDesc().getCurrentMemSize(); @@ -99,7 +105,7 @@ size_t Memory::getSize() const { return size; } -void Memory::create(const 
MemoryDesc &desc, const void *data, bool pads_zeroing) { +void Memory::create(const MemoryDesc& desc, const void* data, bool pads_zeroing) { create(desc.clone(), data, pads_zeroing); } @@ -187,9 +193,7 @@ dnnl::memory Memory::DnnlMemPrimHandle::getPrim() const { void* Memory::getData() const { void* data = getDataNoThrow(); - if (data == nullptr && - m_pMemDesc->getShape().isStatic() && - m_pMemDesc->getShape().getElementsCount() != 0) + if (data == nullptr && m_pMemDesc->getShape().isStatic() && m_pMemDesc->getShape().getElementsCount() != 0) OPENVINO_THROW("Memory has not been allocated"); return data; } @@ -198,7 +202,7 @@ void* MemoryBlockWithReuse::getRawPtr() const noexcept { return m_data.get(); } -void MemoryBlockWithReuse::setExtBuff(void *ptr, size_t size) { +void MemoryBlockWithReuse::setExtBuff(void* ptr, size_t size) { m_useExternalStorage = true; m_memUpperBound = size; m_data = decltype(m_data)(ptr, release); @@ -208,7 +212,7 @@ bool MemoryBlockWithReuse::resize(size_t size) { constexpr int cacheLineSize = 64; bool sizeChanged = false; if (size > m_memUpperBound) { - void *ptr = dnnl::impl::malloc(size, cacheLineSize); + void* ptr = dnnl::impl::malloc(size, cacheLineSize); if (!ptr) { OPENVINO_THROW("Failed to allocate ", size, " bytes of memory"); } @@ -236,15 +240,17 @@ void MemoryBlockWithReuse::free() { m_useExternalStorage = false; } -void MemoryBlockWithReuse::release(void *ptr) {} +void MemoryBlockWithReuse::release(void* ptr) {} -void MemoryBlockWithReuse::destroy(void *ptr) { +void MemoryBlockWithReuse::destroy(void* ptr) { dnnl::impl::free(ptr); } /////////////// StringMemory /////////////// -StringMemory::StringMemory(const dnnl::engine& engine, const MemoryDescPtr& desc, const void* data) : m_engine(engine), m_mem_desc(desc) { +StringMemory::StringMemory(const dnnl::engine& engine, const MemoryDescPtr& desc, const void* data) + : m_engine(engine), + m_mem_desc(desc) { if (m_mem_desc->getPrecision() != element::string) { 
OPENVINO_THROW("[CPU] StringMemory supports String type only."); } @@ -258,8 +264,8 @@ StringMemory::StringMemory(const dnnl::engine& engine, const MemoryDescPtr& desc const auto string_size = m_mem_desc->getShape().getElementsCount(); if (data != nullptr) { - auto not_const_data = const_cast(data); - m_memoryBlock->setExtBuff(reinterpret_cast(not_const_data), string_size); + auto not_const_data = const_cast(data); + m_memoryBlock->setExtBuff(reinterpret_cast(not_const_data), string_size); } else { m_memoryBlock->resize(string_size); } @@ -273,7 +279,7 @@ void StringMemory::load(const IMemory& src, bool ftz) const { transferData(src, *this, false); } -void* StringMemory::getData() const { +void* StringMemory::getData() const { return m_memoryBlock->getRawPtr(); } @@ -297,7 +303,7 @@ void StringMemory::nullify() { } } -size_t StringMemory::getSize() const { // In bytes +size_t StringMemory::getSize() const { // In bytes auto size = getDesc().getCurrentMemSize(); if (size == MemoryDesc::UNDEFINED_SIZE) { OPENVINO_THROW("Can't get memory size for undefined shape."); @@ -329,7 +335,7 @@ bool StringMemory::StringMemoryBlock::resize(size_t size) { if (size > PTRDIFF_MAX) { OPENVINO_THROW("Requested allocation size { ", size, " } exceeds PTRDIFF_MAX."); } - auto ptr_size = static_cast(size); // WA for warning alloc-size-larger-than + auto ptr_size = static_cast(size); // WA for warning alloc-size-larger-than auto ptr = new OvString[ptr_size]; if (!ptr) { OPENVINO_THROW("Failed to allocate ", size, " bytes of memory"); @@ -355,7 +361,7 @@ void StringMemory::StringMemoryBlock::destroy(OvString* ptr) { } void* StringMemory::StringMemoryBlock::getRawPtr() const noexcept { - return reinterpret_cast(m_data.get()); + return reinterpret_cast(m_data.get()); } /////////////// DnnlMemoryBlock /////////////// @@ -364,7 +370,7 @@ void* DnnlMemoryBlock::getRawPtr() const noexcept { return m_pMemBlock->getRawPtr(); } -void DnnlMemoryBlock::setExtBuff(void *ptr, size_t size) { +void 
DnnlMemoryBlock::setExtBuff(void* ptr, size_t size) { m_pMemBlock->setExtBuff(ptr, size); notifyUpdate(); } @@ -401,8 +407,9 @@ void DnnlMemoryBlock::notifyUpdate() { } } -StaticMemory::StaticMemory(const dnnl::engine& eng, MemoryDescPtr desc, const void* data, bool pads_zeroing) : - m_eng(eng), m_pMemDesc(desc) { +StaticMemory::StaticMemory(const dnnl::engine& eng, MemoryDescPtr desc, const void* data, bool pads_zeroing) + : m_eng(eng), + m_pMemDesc(desc) { if (desc->getPrecision() == element::string) { OPENVINO_THROW("[CPU] StaticMemory object cannot be created for string data."); } @@ -427,14 +434,13 @@ StaticMemory::StaticMemory(const dnnl::engine& eng, MemoryDescPtr desc, const vo // // ======================== m_prim.set_data_handle(m_pMemBlock->getRawPtr()); - } - catch (const std::exception& exc) { + } catch (const std::exception& exc) { dnnlErrorCtx = exc.what(); } } -StaticMemory::StaticMemory(const dnnl::engine& eng, const MemoryDesc& desc, const void* data, bool pads_zeroing) : - StaticMemory::StaticMemory(eng, desc.clone(), data, pads_zeroing) {} +StaticMemory::StaticMemory(const dnnl::engine& eng, const MemoryDesc& desc, const void* data, bool pads_zeroing) + : StaticMemory::StaticMemory(eng, desc.clone(), data, pads_zeroing) {} const MemoryDesc& StaticMemory::getDesc() const { return *m_pMemDesc; @@ -475,7 +481,7 @@ MemoryBlockPtr StaticMemory::getMemoryBlock() const { return m_pMemBlock; } -//oneDNN specifics for backward compatibility +// oneDNN specifics for backward compatibility dnnl::memory StaticMemory::getPrimitive() const { if (!m_prim) { OPENVINO_THROW("Couldn't create dnnl::memory object: ", dnnlErrorCtx); @@ -517,11 +523,11 @@ bool StaticMemory::StaticMemoryBlock::hasExtBuffer() const noexcept { } void StaticMemory::StaticMemoryBlock::registerMemory(Memory* memPtr) { - //do nothing + // do nothing } void StaticMemory::StaticMemoryBlock::unregisterMemory(Memory* memPtr) { - //do nothing + // do nothing } #if defined(__linux__) @@ -529,9 
+535,9 @@ void StaticMemory::StaticMemoryBlock::unregisterMemory(Memory* memPtr) { # define MPOL_BIND 2 # define MPOL_MF_STRICT (1 << 0) # define MPOL_MF_MOVE (1 << 1) -#if !defined(__NR_mbind) && defined(__x86_64__) -# define __NR_mbind 237 -#endif +# if !defined(__NR_mbind) && defined(__x86_64__) +# define __NR_mbind 237 +# endif static long mbind(void* start, unsigned long len, int mode, @@ -585,7 +591,12 @@ bool mbind_move(const dnnl::memory mem, int numaNodeID) { return mbind_move(data, size, numaNodeID); } -MemoryPtr split_horizontal(const dnnl::engine& eng, const MemoryPtr src, int dim, int w_rank, int w_size, bool need_fill) { +MemoryPtr split_horizontal(const dnnl::engine& eng, + const MemoryPtr src, + int dim, + int w_rank, + int w_size, + bool need_fill) { auto desc = src->getDescPtr(); auto shape = src->getShape(); auto dims = shape.getDims(); @@ -620,7 +631,9 @@ MemoryPtr split_horizontal(const dnnl::engine& eng, const MemoryPtr src, int dim // reference stride VectorDims stride_dims = dims; stride_dims[dim] = splited_dim_vec[0]; - size_t stride = std::accumulate(stride_dims.begin(), stride_dims.end(), static_cast(1), std::multiplies()) * prec.size(); + size_t stride = + std::accumulate(stride_dims.begin(), stride_dims.end(), static_cast(1), std::multiplies()) * + prec.size(); // create new shape for target memory VectorDims new_dims = dims; @@ -641,7 +654,12 @@ MemoryPtr split_horizontal(const dnnl::engine& eng, const MemoryPtr src, int dim return ptr; } -MemoryPtr split_vertical(const dnnl::engine& eng, const MemoryPtr src, int dim, int w_rank, int w_size, bool need_fill) { +MemoryPtr split_vertical(const dnnl::engine& eng, + const MemoryPtr src, + int dim, + int w_rank, + int w_size, + bool need_fill) { auto desc = src->getDescPtr(); auto shape = src->getShape(); auto dims = shape.getDims(); @@ -697,7 +715,7 @@ MemoryPtr split_vertical(const dnnl::engine& eng, const MemoryPtr src, int dim, strideSize /= 2; copySize /= 2; } - parallel_for(step, 
[&](int i){ + parallel_for(step, [&](int i) { int dst_offset = i * copySize; int src_offset = i * splited_size + w_rank * strideSize; cpu_parallel_memcpy(dstPtr + dst_offset, srcPtr + src_offset, copySize); @@ -705,5 +723,5 @@ MemoryPtr split_vertical(const dnnl::engine& eng, const MemoryPtr src, int dim, return ptr; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/cpu_memory.h b/src/plugins/intel_cpu/src/cpu_memory.h index 70e6713e36b886..f6837064babfa6 100644 --- a/src/plugins/intel_cpu/src/cpu_memory.h +++ b/src/plugins/intel_cpu/src/cpu_memory.h @@ -4,18 +4,18 @@ #pragma once -#include "memory_desc/cpu_memory_desc.h" -#include "dnnl_extension_utils.h" -#include #include - -#include "openvino/core/type/element_type.hpp" -#include "openvino/core/type/element_type_traits.hpp" +#include #include #include #include +#include "dnnl_extension_utils.h" +#include "memory_desc/cpu_memory_desc.h" +#include "openvino/core/type/element_type.hpp" +#include "openvino/core/type/element_type_traits.hpp" + /** * @file contains a concept classes to work with memory/tensor/blob abstractions on plugin level. * @@ -47,7 +47,8 @@ class IMemoryBlock { virtual void* getRawPtr() const noexcept = 0; /** - * @brief Allows to set externally allocated memory buffer. In that case, the object has no control over the provided memory. + * @brief Allows to set externally allocated memory buffer. In that case, the object has no control over the + * provided memory. 
* @param ptr - pointer to the memory * @param size - size of the memory buffer */ @@ -82,11 +83,11 @@ class MemoryBlockWithReuse : public IMemoryBlock { private: bool m_useExternalStorage = false; size_t m_memUpperBound = 0ul; - std::unique_ptr m_data; + std::unique_ptr m_data; int numa_node; - static void release(void *ptr); - static void destroy(void *ptr); + static void release(void* ptr); + static void destroy(void* ptr); }; class IMemoryBlockObserver : public IMemoryBlock { @@ -128,13 +129,13 @@ class DnnlMemBlockHandle { } DnnlMemBlockHandle(const DnnlMemBlockHandle&) = delete; - DnnlMemBlockHandle& operator= (const DnnlMemBlockHandle&) = delete; + DnnlMemBlockHandle& operator=(const DnnlMemBlockHandle&) = delete; DnnlMemBlockHandle(DnnlMemBlockHandle&& source) { std::swap(m_pMemBlock, source.m_pMemBlock); std::swap(m_pMem, source.m_pMem); } - DnnlMemBlockHandle& operator= (DnnlMemBlockHandle&& rhs) { + DnnlMemBlockHandle& operator=(DnnlMemBlockHandle&& rhs) { std::swap(m_pMemBlock, rhs.m_pMemBlock); std::swap(m_pMem, rhs.m_pMem); return *this; @@ -166,7 +167,7 @@ class IMemory { virtual const MemoryDesc& getDesc() const = 0; virtual MemoryDescPtr getDescPtr() const = 0; - virtual void* getData() const = 0; // pointer to the actual memory + virtual void* getData() const = 0; // pointer to the actual memory template ::type> T* getDataAs() const { @@ -177,7 +178,7 @@ class IMemory { return static_cast(getData()); } - virtual size_t getSize() const = 0; // in bytes + virtual size_t getSize() const = 0; // in bytes virtual const Shape& getShape() const = 0; virtual const VectorDims& getStaticDims() const = 0; @@ -199,7 +200,7 @@ class IMemory { return false; } - //oneDNN specifics for backward compatibility + // oneDNN specifics for backward compatibility virtual dnnl::memory getPrimitive() const = 0; ov::element::Type getPrecision() const { @@ -211,8 +212,8 @@ class IMemory { } template ::value && !std::is_reference::value, int>::type = 0, - typename 
std::enable_if::value, int>::type = 0> + typename std::enable_if::value && !std::is_reference::value, int>::type = 0, + typename std::enable_if::value, int>::type = 0> std::shared_ptr getDescWithType() const; }; @@ -241,17 +242,17 @@ class StaticMemory final : public IMemory { StaticMemory(const dnnl::engine& eng, const MemoryDesc& desc, const void* data = nullptr, bool pads_zeroing = true); StaticMemory(const StaticMemory&) = delete; - StaticMemory& operator= (const StaticMemory&) = delete; + StaticMemory& operator=(const StaticMemory&) = delete; StaticMemory(Memory&&) = delete; - StaticMemory& operator= (StaticMemory&&) = delete; + StaticMemory& operator=(StaticMemory&&) = delete; const MemoryDesc& getDesc() const override; MemoryDescPtr getDescPtr() const override; - void* getData() const override; // pointer to the actual memory + void* getData() const override; // pointer to the actual memory - size_t getSize() const override; // in bytes + size_t getSize() const override; // in bytes const Shape& getShape() const override; const VectorDims& getStaticDims() const override; @@ -262,7 +263,7 @@ class StaticMemory final : public IMemory { MemoryBlockPtr getMemoryBlock() const override; - //oneDNN specifics for backward compatibility + // oneDNN specifics for backward compatibility dnnl::memory getPrimitive() const override; void nullify() override; @@ -284,10 +285,10 @@ class Memory : public IMemory { Memory(const dnnl::engine& eng, const MemoryDesc& desc, MemoryBlockPtr block); Memory(const Memory&) = delete; - Memory& operator= (const Memory&) = delete; + Memory& operator=(const Memory&) = delete; Memory(Memory&&) = delete; - Memory& operator= (Memory&&) = delete; + Memory& operator=(Memory&&) = delete; dnnl::memory getPrimitive() const override; @@ -341,7 +342,7 @@ class Memory : public IMemory { bool m_padsZeroing = true; class DnnlMemPrimHandle { public: - explicit DnnlMemPrimHandle(const Memory* memObjPtr): m_memObjPtr(memObjPtr) {} + explicit 
DnnlMemPrimHandle(const Memory* memObjPtr) : m_memObjPtr(memObjPtr) {} bool isInit() const; dnnl::memory getPrim() const; void resetDnnlPrim(); @@ -376,7 +377,7 @@ class StringMemory : public IMemory { private: bool m_use_external_storage = false; size_t m_str_upper_bound = 0lu; - std::unique_ptr m_data; + std::unique_ptr m_data; static void release(OvString* ptr) {} static void destroy(OvString* ptr); @@ -390,7 +391,9 @@ class StringMemory : public IMemory { : StringMemory(engine, desc.clone(), data) {} StringMemory(const dnnl::engine& engine, const MemoryDescPtr& desc, const StringMemoryBlockPtr& block) - : m_engine(engine), m_mem_desc(desc), m_memoryBlock(block) {} + : m_engine(engine), + m_mem_desc(desc), + m_memoryBlock(block) {} StringMemory(const dnnl::engine& engine, const MemoryDesc& desc, const StringMemoryBlockPtr& block) : StringMemory(engine, desc.clone(), block) {} @@ -405,7 +408,7 @@ class StringMemory : public IMemory { void* getData() const override; - size_t getSize() const override; // In bytes + size_t getSize() const override; // In bytes const Shape& getShape() const override { return m_mem_desc->getShape(); @@ -443,8 +446,18 @@ bool mbind_move(void* data, size_t size, int numaNodeID); bool mbind_move(const MemoryCPtr mem, int numaNodeID); bool mbind_move(const dnnl::memory mem, int numaNodeID); -MemoryPtr split_horizontal(const dnnl::engine& eng, const MemoryPtr src, int dim, int w_rank, int w_size, bool need_fill = true); -MemoryPtr split_vertical(const dnnl::engine& eng, const MemoryPtr src, int dim, int w_rank, int w_size, bool need_fill = true); - -} // namespace intel_cpu -} // namespace ov +MemoryPtr split_horizontal(const dnnl::engine& eng, + const MemoryPtr src, + int dim, + int w_rank, + int w_size, + bool need_fill = true); +MemoryPtr split_vertical(const dnnl::engine& eng, + const MemoryPtr src, + int dim, + int w_rank, + int w_size, + bool need_fill = true); + +} // namespace intel_cpu +} // namespace ov diff --git 
a/src/plugins/intel_cpu/src/cpu_shape.cpp b/src/plugins/intel_cpu/src/cpu_shape.cpp index 4c6b5793d9f2ef..2b7011af1a1f5e 100644 --- a/src/plugins/intel_cpu/src/cpu_shape.cpp +++ b/src/plugins/intel_cpu/src/cpu_shape.cpp @@ -3,12 +3,13 @@ // #include "cpu_shape.h" + #include "utils/general_utils.h" namespace ov { namespace intel_cpu { -bool Shape::isCompatible(const VectorDims &vecDims) const { +bool Shape::isCompatible(const VectorDims& vecDims) const { if (getRank() != vecDims.size()) { return false; } @@ -21,17 +22,21 @@ bool Shape::isCompatible(const VectorDims &vecDims) const { return false; } - if (!std::equal(getMaxDims().begin(), getMaxDims().end(), vecDims.begin(), [](Dim lhs, Dim rhs) { return lhs >= rhs; })) { + if (!std::equal(getMaxDims().begin(), getMaxDims().end(), vecDims.begin(), [](Dim lhs, Dim rhs) { + return lhs >= rhs; + })) { return false; } - if (!std::equal(getMinDims().begin(), getMinDims().end(), vecDims.begin(), [](Dim lhs, Dim rhs) { return lhs <= rhs; })) { + if (!std::equal(getMinDims().begin(), getMinDims().end(), vecDims.begin(), [](Dim lhs, Dim rhs) { + return lhs <= rhs; + })) { return false; } return true; } -std::string Shape::toString() const { +std::string Shape::toString() const { std::stringstream output; output << "{"; @@ -50,10 +55,10 @@ std::string Shape::toString() const { Shape mergeShapes(const Shape& lhs, const Shape& rhs) { OPENVINO_ASSERT(lhs.getRank() == rhs.getRank(), - "Couldn't merge shapes of different ranks: shape 1:", - lhs.toString(), - " shape 2: ", - rhs.toString()); + "Couldn't merge shapes of different ranks: shape 1:", + lhs.toString(), + " shape 2: ", + rhs.toString()); const auto& lhsMinDims = lhs.getMinDims(); const auto& lhsMaxDims = lhs.getMaxDims(); @@ -66,10 +71,11 @@ Shape mergeShapes(const Shape& lhs, const Shape& rhs) { for (size_t i = 0; i < resultMinDims.size(); ++i) { resultMinDims[i] = std::max(lhsMinDims[i], rhsMinDims[i]); resultMaxDims[i] = std::min(lhsMaxDims[i], rhsMaxDims[i]); - 
OPENVINO_ASSERT(resultMinDims[i] <= resultMaxDims[i], "Couldn't merge shapes as the dims intervals are not overlapping."); + OPENVINO_ASSERT(resultMinDims[i] <= resultMaxDims[i], + "Couldn't merge shapes as the dims intervals are not overlapping."); } return Shape{resultMinDims, resultMaxDims}; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/cpu_shape.h b/src/plugins/intel_cpu/src/cpu_shape.h index a04b043689e520..f2895287e2f8fe 100644 --- a/src/plugins/intel_cpu/src/cpu_shape.h +++ b/src/plugins/intel_cpu/src/cpu_shape.h @@ -31,13 +31,17 @@ class Shape { type = shape.is_static() ? ShapeType::Static : ShapeType::Dynamic; initDims(); - hasZeroDimensions = std::any_of(dims.begin(), dims.end(), [](size_t dim) { return dim == 0; } ); + hasZeroDimensions = std::any_of(dims.begin(), dims.end(), [](size_t dim) { + return dim == 0; + }); } explicit Shape(const VectorDims& shape) { dims = minDims = maxDims = shape; type = ShapeType::Static; - hasZeroDimensions = std::any_of(dims.begin(), dims.end(), [](size_t dim) { return dim == 0; } ); + hasZeroDimensions = std::any_of(dims.begin(), dims.end(), [](size_t dim) { + return dim == 0; + }); } Shape(const VectorDims& minDims, const VectorDims& maxDims) { @@ -49,13 +53,17 @@ class Shape { initDims(); - if (std::any_of(dims.begin(), dims.end(), [](size_t dim) { return dim == Shape::UNDEFINED_DIM; } )) { + if (std::any_of(dims.begin(), dims.end(), [](size_t dim) { + return dim == Shape::UNDEFINED_DIM; + })) { type = ShapeType::Dynamic; } else { type = ShapeType::Static; } - hasZeroDimensions = std::any_of(dims.begin(), dims.end(), [](size_t dim) { return dim == 0; } ); + hasZeroDimensions = std::any_of(dims.begin(), dims.end(), [](size_t dim) { + return dim == 0; + }); } Shape(const std::initializer_list& shape) { @@ -69,7 +77,9 @@ class Shape { initDims(); - hasZeroDimensions = std::any_of(dims.begin(), dims.end(), [](size_t dim) { return dim == 0; 
} ); + hasZeroDimensions = std::any_of(dims.begin(), dims.end(), [](size_t dim) { + return dim == 0; + }); } /** @@ -181,21 +191,21 @@ class Shape { std::string toString() const; - bool operator == (const Shape& rhs) const { + bool operator==(const Shape& rhs) const { return minDims == rhs.minDims && maxDims == rhs.maxDims; } - bool operator != (const Shape& rhs) const { + bool operator!=(const Shape& rhs) const { return !(*this == rhs); } bool hasDefinedUpperBounds() const { - return std::all_of(maxDims.begin(), maxDims.end(), [](Dim dim){ return dim != UNDEFINED_DIM; }); + return std::all_of(maxDims.begin(), maxDims.end(), [](Dim dim) { + return dim != UNDEFINED_DIM; + }); } - enum : Dim { - UNDEFINED_DIM = std::numeric_limits::max() - }; + enum : Dim { UNDEFINED_DIM = std::numeric_limits::max() }; private: void initDims() { @@ -205,10 +215,7 @@ class Shape { } } - enum class ShapeType { - Static, - Dynamic - } type {ShapeType::Static}; + enum class ShapeType { Static, Dynamic } type{ShapeType::Static}; bool hasZeroDimensions = false; @@ -229,5 +236,5 @@ class Shape { Shape mergeShapes(const Shape& lhs, const Shape& rhs); -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp b/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp index 0ed64d49ea68dd..be6f5c4035d1ee 100644 --- a/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp +++ b/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp @@ -4,6 +4,11 @@ #include "cpu_streams_calculation.hpp" +#include +#include +#include +#include + #include "cpu_map_scheduling.hpp" #include "graph.h" #include "openvino/op/fake_quantize.hpp" @@ -13,29 +18,25 @@ #include "transformations/utils.hpp" #include "transformations/utils/utils.hpp" -#include -#include -#include -#include - using namespace ov; using namespace ov::threading; -#define INIT_VAL -100 +#define INIT_VAL -100 #define TP_CPU_LIMIT 32 namespace ov { namespace 
intel_cpu { -std::vector> get_streams_info_table(const int input_streams, - const bool input_streams_changed, - const int input_threads, - const int input_infer_requests, - const int model_prefer_threads, - const int input_current_socket_id, - const std::string input_perf_hint, - const std::set hint_model_distribution_policy, - const std::vector>& proc_type_table) { +std::vector> get_streams_info_table( + const int input_streams, + const bool input_streams_changed, + const int input_threads, + const int input_infer_requests, + const int model_prefer_threads, + const int input_current_socket_id, + const std::string input_perf_hint, + const std::set hint_model_distribution_policy, + const std::vector>& proc_type_table) { std::vector stream_info(CPU_STREAMS_TABLE_SIZE, INIT_VAL); std::vector> streams_info_table; std::vector> proc_socket_table; @@ -242,26 +243,32 @@ std::vector> get_streams_info_table(const int input_streams, n_threads_per_stream = proc_type_table[0][ALL_PROC]; } } else { - int numa_index = 1; + size_t socket_index = 0; + for (socket_index = 0; socket_index < proc_socket_table.size(); socket_index++) { + if (proc_socket_table[socket_index][PROC_SOCKET_ID] == current_socket_id) { + break; + } + } + const std::vector& current_socket_info = proc_socket_table[socket_index]; n_threads_per_stream = model_prefer_threads == 0 - ? proc_type_table[numa_index][ALL_PROC] - : std::min(proc_type_table[numa_index][ALL_PROC], model_prefer_threads); + ? 
current_socket_info[ALL_PROC] + : std::min(current_socket_info[ALL_PROC], model_prefer_threads); stream_info[THREADS_PER_STREAM] = n_threads_per_stream; - if (proc_type_table[numa_index][ALL_PROC] == proc_type_table[numa_index][MAIN_CORE_PROC]) { + if (current_socket_info[ALL_PROC] == current_socket_info[MAIN_CORE_PROC]) { stream_info[PROC_TYPE] = MAIN_CORE_PROC; - update_streams_per_node(MAIN_CORE_PROC, proc_type_table[numa_index]); - } else if (proc_type_table[numa_index][ALL_PROC] == proc_type_table[numa_index][EFFICIENT_CORE_PROC]) { + update_streams_per_node(MAIN_CORE_PROC, current_socket_info); + } else if (current_socket_info[ALL_PROC] == current_socket_info[EFFICIENT_CORE_PROC]) { stream_info[PROC_TYPE] = EFFICIENT_CORE_PROC; - update_streams_per_node(EFFICIENT_CORE_PROC, proc_type_table[numa_index]); + update_streams_per_node(EFFICIENT_CORE_PROC, current_socket_info); } else { stream_info[PROC_TYPE] = ALL_PROC; - update_mix_stream_info(proc_type_table[numa_index], - {proc_type_table[numa_index]}, + update_mix_stream_info(current_socket_info, + proc_type_table, n_threads_per_stream, IStreamsExecutor::Config::StreamsMode::SUB_STREAMS_NULL, ALL_PROC); } - update_ids_method(proc_type_table[numa_index]); + update_ids_method(current_socket_info); } } else { n_threads = @@ -333,8 +340,7 @@ std::vector> get_streams_info_table(const int input_streams, n_threads_per_stream = static_cast(n_threads / n_streams); check_threads_per_stream(); } else { - n_threads_per_stream = - model_threads > 0 ? model_threads : static_cast(n_threads / n_streams); + n_threads_per_stream = model_threads > 0 ? 
model_threads : static_cast(n_threads / n_streams); } } } @@ -584,7 +590,7 @@ int get_model_prefer_threads(const int num_streams, (networkToleranceForLowCache.ratio_mem_limited_gemms > ov::MemBandwidthPressure::LIMITED))) { config.modelPreferThreads = 8; } -#elif((defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)) && defined(__APPLE__)) +#elif ((defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)) && defined(__APPLE__)) config.modelPreferThreads = 1; if (networkToleranceForLowCache.max_mem_tolerance == ov::MemBandwidthPressure::UNKNOWN) { if ((networkToleranceForLowCache.ratio_compute_convs == ov::MemBandwidthPressure::ALL) || diff --git a/src/plugins/intel_cpu/src/cpu_streams_calculation.hpp b/src/plugins/intel_cpu/src/cpu_streams_calculation.hpp index e362c0373d8d1d..0a0b4a1449b7cb 100644 --- a/src/plugins/intel_cpu/src/cpu_streams_calculation.hpp +++ b/src/plugins/intel_cpu/src/cpu_streams_calculation.hpp @@ -44,15 +44,16 @@ namespace intel_cpu { * in previous function. * @return streams information table which will be used by StreamsExecutor. */ -std::vector> get_streams_info_table(const int input_streams, - const bool input_streams_changed, - const int input_threads, - const int input_infer_requests, - const int model_prefer_threads, - const int input_current_socket_id, - const std::string input_perf_hint, - const std::set hint_llm_distribution_policy, - const std::vector>& proc_type_table); +std::vector> get_streams_info_table( + const int input_streams, + const bool input_streams_changed, + const int input_threads, + const int input_infer_requests, + const int model_prefer_threads, + const int input_current_socket_id, + const std::string input_perf_hint, + const std::set hint_llm_distribution_policy, + const std::vector>& proc_type_table); /** * @brief Generate streams rank table for tensor parallel according to streams info table. 
@@ -106,9 +107,7 @@ std::vector> generate_stream_info(const int streams, * @param[in] model graph handle * @param[in] config intel cpu configuration */ -void get_num_streams(const int streams, - const std::shared_ptr& model, - Config& config); +void get_num_streams(const int streams, const std::shared_ptr& model, Config& config); } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/cpu_tensor.cpp b/src/plugins/intel_cpu/src/cpu_tensor.cpp index 1a045ca117a538..0f82a8a9a4dfec 100644 --- a/src/plugins/intel_cpu/src/cpu_tensor.cpp +++ b/src/plugins/intel_cpu/src/cpu_tensor.cpp @@ -16,7 +16,8 @@ Tensor::Tensor(MemoryPtr memptr) : m_memptr{memptr} { // only support plain data format ncsp. auto memdesc = m_memptr->getDescPtr(); - OPENVINO_ASSERT(memdesc->hasLayoutType(LayoutType::ncsp), "intel_cpu::Tensor only supports memory with ncsp layout."); + OPENVINO_ASSERT(memdesc->hasLayoutType(LayoutType::ncsp), + "intel_cpu::Tensor only supports memory with ncsp layout."); m_element_type = memdesc->getPrecision(); } @@ -24,8 +25,14 @@ Tensor::Tensor(MemoryPtr memptr) : m_memptr{memptr} { void Tensor::set_shape(ov::Shape new_shape) { const auto& shape = m_memptr->getDescPtr()->getShape(); if (shape.isStatic()) { - DEBUG_LOG("tensor's memory object ", m_memptr.get(), ", ", vec2str(shape.getStaticDims()), " -> ", new_shape.to_string()); - if (shape.getStaticDims() == new_shape) return; + DEBUG_LOG("tensor's memory object ", + m_memptr.get(), + ", ", + vec2str(shape.getStaticDims()), + " -> ", + new_shape.to_string()); + if (shape.getStaticDims() == new_shape) + return; } auto desc = m_memptr->getDescPtr(); @@ -69,7 +76,7 @@ void Tensor::update_strides() const { OPENVINO_ASSERT(blocked_desc, "not a valid blocked memory descriptor."); auto& strides = blocked_desc->getStrides(); m_strides.resize(strides.size()); - std::transform(strides.cbegin(), strides.cend(), m_strides.begin(), [this] (const size_t stride) { + std::transform(strides.cbegin(), 
strides.cend(), m_strides.begin(), [this](const size_t stride) { return stride * m_element_type.size(); }); } @@ -96,5 +103,5 @@ std::shared_ptr make_tensor(MemoryPtr mem) { return std::make_shared(mem); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/cpu_tensor.h b/src/plugins/intel_cpu/src/cpu_tensor.h index 0f073e0d298faf..86648ce969b168 100644 --- a/src/plugins/intel_cpu/src/cpu_tensor.h +++ b/src/plugins/intel_cpu/src/cpu_tensor.h @@ -4,8 +4,8 @@ #pragma once -#include "openvino/runtime/itensor.hpp" #include "cpu_memory.h" +#include "openvino/runtime/itensor.hpp" namespace ov { namespace intel_cpu { @@ -29,7 +29,9 @@ class Tensor : public ITensor { void* data(const element::Type& type = {}) const override; - MemoryPtr get_memory() {return m_memptr;} + MemoryPtr get_memory() { + return m_memptr; + } private: void update_strides() const; @@ -44,5 +46,5 @@ class Tensor : public ITensor { std::shared_ptr make_tensor(MemoryPtr mem); -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/cpu_types.cpp b/src/plugins/intel_cpu/src/cpu_types.cpp index 3b6440e56c3272..67c538bd78341a 100644 --- a/src/plugins/intel_cpu/src/cpu_types.cpp +++ b/src/plugins/intel_cpu/src/cpu_types.cpp @@ -2,10 +2,11 @@ // SPDX-License-Identifier: Apache-2.0 // #include "cpu_types.h" -#include "cpu_shape.h" -#include #include +#include + +#include "cpu_shape.h" namespace ov { namespace intel_cpu { @@ -41,6 +42,9 @@ static const TypeToNameMap& get_type_to_name_tbl() { {"GroupConvolution", Type::Convolution}, {"MatMul", Type::MatMul}, {"FullyConnected", Type::FullyConnected}, + {"FullyConnectedCompressed", Type::FullyConnected}, + {"FullyConnectedQuantizedLegacy", Type::FullyConnected}, + {"FullyConnectedQuantized", Type::FullyConnected}, {"MaxPool", Type::Pooling}, {"AvgPool", 
Type::Pooling}, {"AdaptiveMaxPool", Type::AdaptivePooling}, @@ -257,8 +261,7 @@ static const TypeToNameMap& get_type_to_name_tbl() { {"QKVProjection", Type::QKVProjection}, {"RMS", Type::RMS}, {"SearchSorted", Type::SearchSorted}, - {"LoraSubgraph", Type::LoRA} - }; + {"LoraSubgraph", Type::LoRA}}; return type_to_name_tbl; } @@ -469,6 +472,10 @@ std::string algToString(const Algorithm alg) { CASE(FQCommon); CASE(FQQuantization); CASE(FQBinarization); + CASE(FullyConnectedCommon); + CASE(FullyConnectedCompressed); + CASE(FullyConnectedQuantized); + CASE(FullyConnectedQuantizedLegacy); CASE(ROIPoolingMax); CASE(ROIPoolingBilinear); CASE(ROIAlignMax); diff --git a/src/plugins/intel_cpu/src/cpu_types.h b/src/plugins/intel_cpu/src/cpu_types.h index 9461526184b0bf..71088c22af8336 100644 --- a/src/plugins/intel_cpu/src/cpu_types.h +++ b/src/plugins/intel_cpu/src/cpu_types.h @@ -213,6 +213,12 @@ enum class Algorithm { EltwiseBitwiseLeftShift, EltwiseBitwiseRightShift, + // FullyConnected algorithms + FullyConnectedCommon, + FullyConnectedCompressed, + FullyConnectedQuantized, + FullyConnectedQuantizedLegacy, + // FakeQuantize algorithms FQCommon, FQQuantization, diff --git a/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp b/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp index 3d9b2f69bd8f66..457f8368f734dd 100644 --- a/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp +++ b/src/plugins/intel_cpu/src/dnnl_extension_utils.cpp @@ -47,79 +47,79 @@ uint8_t DnnlExtensionUtils::sizeOfDataType(dnnl::memory::data_type dataType) { dnnl::memory::data_type DnnlExtensionUtils::ElementTypeToDataType(const ov::element::Type& elementType) { switch (elementType) { - case ov::element::f32: - return memory::data_type::f32; - case ov::element::i32: - return memory::data_type::s32; - case ov::element::bf16: - return memory::data_type::bf16; - case ov::element::i8: - return memory::data_type::s8; - case ov::element::u8: - case ov::element::boolean: - return memory::data_type::u8; - case 
ov::element::u1: - return memory::data_type::bin; - case ov::element::f16: - return memory::data_type::f16; - case ov::element::nf4: - return memory::data_type::nf4; - case ov::element::i4: - return memory::data_type::s4; - case ov::element::u4: - return memory::data_type::u4; - case ov::element::f8e8m0: - return memory::data_type::f8_e8m0; - case ov::element::f4e2m1: - return memory::data_type::f4_e2m1; - case ov::element::undefined: - return memory::data_type::undef; - default: { - OPENVINO_THROW("CPU plugin does not support ", elementType.to_string(), " for use with oneDNN."); - } + case ov::element::f32: + return memory::data_type::f32; + case ov::element::i32: + return memory::data_type::s32; + case ov::element::bf16: + return memory::data_type::bf16; + case ov::element::i8: + return memory::data_type::s8; + case ov::element::u8: + case ov::element::boolean: + return memory::data_type::u8; + case ov::element::u1: + return memory::data_type::bin; + case ov::element::f16: + return memory::data_type::f16; + case ov::element::nf4: + return memory::data_type::nf4; + case ov::element::i4: + return memory::data_type::s4; + case ov::element::u4: + return memory::data_type::u4; + case ov::element::f8e8m0: + return memory::data_type::f8_e8m0; + case ov::element::f4e2m1: + return memory::data_type::f4_e2m1; + case ov::element::undefined: + return memory::data_type::undef; + default: { + OPENVINO_THROW("CPU plugin does not support ", elementType.to_string(), " for use with oneDNN."); + } } } ov::element::Type DnnlExtensionUtils::DataTypeToElementType(const dnnl::memory::data_type& dataType) { switch (dataType) { - case memory::data_type::f32: - return ov::element::f32; - case memory::data_type::s32: - return ov::element::i32; - case memory::data_type::bf16: - return ov::element::bf16; - case memory::data_type::s8: - return ov::element::i8; - case memory::data_type::u8: - return ov::element::u8; - case memory::data_type::bin: - return ov::element::u1; - case 
memory::data_type::f16: - return ov::element::f16; - case memory::data_type::f64: - return ov::element::f64; - case memory::data_type::nf4: - return ov::element::nf4; - case memory::data_type::s4: - return ov::element::i4; - case memory::data_type::u4: - return ov::element::u4; - case memory::data_type::f8_e8m0: - return ov::element::f8e8m0; - case memory::data_type::f4_e2m1: - return ov::element::f4e2m1; - case memory::data_type::undef: - return ov::element::undefined; - default: { - OPENVINO_THROW("Unsupported data type."); - } + case memory::data_type::f32: + return ov::element::f32; + case memory::data_type::s32: + return ov::element::i32; + case memory::data_type::bf16: + return ov::element::bf16; + case memory::data_type::s8: + return ov::element::i8; + case memory::data_type::u8: + return ov::element::u8; + case memory::data_type::bin: + return ov::element::u1; + case memory::data_type::f16: + return ov::element::f16; + case memory::data_type::f64: + return ov::element::f64; + case memory::data_type::nf4: + return ov::element::nf4; + case memory::data_type::s4: + return ov::element::i4; + case memory::data_type::u4: + return ov::element::u4; + case memory::data_type::f8_e8m0: + return ov::element::f8e8m0; + case memory::data_type::f4_e2m1: + return ov::element::f4e2m1; + case memory::data_type::undef: + return ov::element::undefined; + default: { + OPENVINO_THROW("Unsupported data type."); + } } } -Dim DnnlExtensionUtils::convertToDim(const dnnl::memory::dim &dim) { - return dim == DNNL_RUNTIME_DIM_VAL ? Shape::UNDEFINED_DIM : static_cast(dim); +Dim DnnlExtensionUtils::convertToDim(const dnnl::memory::dim& dim) { + return dim == DNNL_RUNTIME_DIM_VAL ? Shape::UNDEFINED_DIM : static_cast(dim); } -dnnl::memory::dim DnnlExtensionUtils::convertToDnnlDim(const Dim &dim) { +dnnl::memory::dim DnnlExtensionUtils::convertToDnnlDim(const Dim& dim) { return dim == Shape::UNDEFINED_DIM ? 
DNNL_RUNTIME_DIM_VAL : static_cast(dim); } @@ -141,25 +141,25 @@ memory::dims DnnlExtensionUtils::convertToDnnlDims(const VectorDims& dims) { memory::format_tag DnnlExtensionUtils::GetPlainFormatByRank(size_t rank) { switch (rank) { - case 0: - case 1: - return memory::format_tag::a; - case 2: - return memory::format_tag::ab; - case 3: - return memory::format_tag::abc; - case 4: - return memory::format_tag::abcd; - case 5: - return memory::format_tag::abcde; - case 6: - return memory::format_tag::abcdef; - default: - return memory::format_tag::undef; + case 0: + case 1: + return memory::format_tag::a; + case 2: + return memory::format_tag::ab; + case 3: + return memory::format_tag::abc; + case 4: + return memory::format_tag::abcd; + case 5: + return memory::format_tag::abcde; + case 6: + return memory::format_tag::abcdef; + default: + return memory::format_tag::undef; } } -DnnlMemoryDescPtr DnnlExtensionUtils::makeDescriptor(const dnnl::memory::desc &desc) { +DnnlMemoryDescPtr DnnlExtensionUtils::makeDescriptor(const dnnl::memory::desc& desc) { return makeDescriptor(desc.get()); } @@ -182,7 +182,8 @@ size_t DnnlExtensionUtils::getMemSizeForDnnlDesc(const dnnl::memory::desc& desc) return size; } -std::shared_ptr DnnlExtensionUtils::makeUndefinedDesc(const memory::desc &desc, const Shape &shape) { +std::shared_ptr DnnlExtensionUtils::makeUndefinedDesc(const memory::desc& desc, + const Shape& shape) { if (desc.get_format_kind() == memory::format_kind::blocked) { return std::shared_ptr(new DnnlBlockedMemoryDesc(desc, shape)); } else { @@ -190,7 +191,9 @@ std::shared_ptr DnnlExtensionUtils::makeUndefinedDesc(con } } -DnnlMemoryDescPtr DnnlExtensionUtils::query_md(const const_dnnl_primitive_desc_t& pd, const dnnl::query& what, int idx) { +DnnlMemoryDescPtr DnnlExtensionUtils::query_md(const const_dnnl_primitive_desc_t& pd, + const dnnl::query& what, + int idx) { auto query = dnnl::convert_to_c(what); const auto* cdesc = dnnl_primitive_desc_query_md(pd, query, idx); @@ 
-201,7 +204,7 @@ DnnlMemoryDescPtr DnnlExtensionUtils::query_md(const const_dnnl_primitive_desc_t } std::string DnnlExtensionUtils::query_impl_info_str(const const_dnnl_primitive_desc_t& pd) { - const char *res; + const char* res; dnnl_status_t status = dnnl_primitive_desc_query(pd, dnnl_query_impl_info_str, 0, &res); if (status != dnnl_success) OPENVINO_THROW("query_impl_info_str failed."); @@ -209,10 +212,9 @@ std::string DnnlExtensionUtils::query_impl_info_str(const const_dnnl_primitive_d } bool DnnlExtensionUtils::find_implementation(dnnl::primitive_desc& desc, impl_desc_type impl_type) { - return DnnlExtensionUtils::find_implementation(desc, - [impl_type](impl_desc_type cur_impl_type){ - return cur_impl_type == impl_type; - }); + return DnnlExtensionUtils::find_implementation(desc, [impl_type](impl_desc_type cur_impl_type) { + return cur_impl_type == impl_type; + }); } dnnl_memory_desc_t DnnlExtensionUtils::clone_desc(const_dnnl_memory_desc_t cdesc) { @@ -233,31 +235,33 @@ const char* DnnlExtensionUtils::query_pd_info(const_dnnl_primitive_desc_t pd) { bool DnnlExtensionUtils::isUnarySupportedAsPostOp(Algorithm alg) { #if defined(OV_CPU_WITH_ACL) - return one_of(alg, Algorithm::EltwiseRelu, - Algorithm::EltwiseTanh, - Algorithm::EltwiseElu, - Algorithm::EltwiseAbs, - Algorithm::EltwiseSqrt, - Algorithm::EltwiseSoftRelu, - Algorithm::EltwiseSigmoid, - Algorithm::EltwiseClamp); + return one_of(alg, + Algorithm::EltwiseRelu, + Algorithm::EltwiseTanh, + Algorithm::EltwiseElu, + Algorithm::EltwiseAbs, + Algorithm::EltwiseSqrt, + Algorithm::EltwiseSoftRelu, + Algorithm::EltwiseSigmoid, + Algorithm::EltwiseClamp); #elif defined(OPENVINO_ARCH_X86_64) - return one_of(alg, Algorithm::EltwiseRelu, - Algorithm::EltwiseGeluErf, - Algorithm::EltwiseGeluTanh, - Algorithm::EltwiseElu, - Algorithm::EltwiseSigmoid, - Algorithm::EltwiseClamp, - Algorithm::EltwiseTanh, - Algorithm::EltwiseSwish, - Algorithm::EltwiseHswish, - Algorithm::EltwiseMish, - Algorithm::EltwiseHsigmoid, - 
Algorithm::EltwiseRoundHalfToEven, - Algorithm::EltwiseRoundHalfAwayFromZero, - Algorithm::EltwiseAbs, - Algorithm::EltwiseSqrt, - Algorithm::EltwiseSoftRelu); + return one_of(alg, + Algorithm::EltwiseRelu, + Algorithm::EltwiseGeluErf, + Algorithm::EltwiseGeluTanh, + Algorithm::EltwiseElu, + Algorithm::EltwiseSigmoid, + Algorithm::EltwiseClamp, + Algorithm::EltwiseTanh, + Algorithm::EltwiseSwish, + Algorithm::EltwiseHswish, + Algorithm::EltwiseMish, + Algorithm::EltwiseHsigmoid, + Algorithm::EltwiseRoundHalfToEven, + Algorithm::EltwiseRoundHalfAwayFromZero, + Algorithm::EltwiseAbs, + Algorithm::EltwiseSqrt, + Algorithm::EltwiseSoftRelu); #else return false; #endif @@ -269,5 +273,5 @@ std::string DnnlExtensionUtils::computeWeightsStringHash(const std::shared_ptr(memory->getData())); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/dnnl_extension_utils.h b/src/plugins/intel_cpu/src/dnnl_extension_utils.h index 7a968ea3c71c3d..ecf223b48497cd 100644 --- a/src/plugins/intel_cpu/src/dnnl_extension_utils.h +++ b/src/plugins/intel_cpu/src/dnnl_extension_utils.h @@ -10,11 +10,11 @@ #include +#include "common/c_types_map.hpp" #include "cpu_types.h" #include "onednn/dnnl.h" #include "onednn/iml_type_mapper.h" #include "openvino/core/type/element_type.hpp" -#include "common/c_types_map.hpp" namespace ov { namespace intel_cpu { @@ -29,8 +29,8 @@ class DnnlExtensionUtils { static uint8_t sizeOfDataType(dnnl::memory::data_type dataType); static dnnl::memory::data_type ElementTypeToDataType(const ov::element::Type& elementType); static ov::element::Type DataTypeToElementType(const dnnl::memory::data_type& dataType); - static Dim convertToDim(const dnnl::memory::dim &dim); - static dnnl::memory::dim convertToDnnlDim(const Dim &dim); + static Dim convertToDim(const dnnl::memory::dim& dim); + static dnnl::memory::dim convertToDnnlDim(const Dim& dim); static VectorDims convertToVectorDims(const 
dnnl::memory::dims& dims); static VectorDims convertToVectorDims(const dnnl::impl::dims_t dims, const int ndims); static std::vector convertToDnnlDims(const VectorDims& dims); @@ -41,25 +41,28 @@ class DnnlExtensionUtils { * @param desc dnnl::memory::desc from which one of the descriptors will be created * @return pointer to DnnlBlockedMemoryDesc or DnnlMemoryDesc */ - static std::shared_ptr makeDescriptor(const dnnl::memory::desc &desc); + static std::shared_ptr makeDescriptor(const dnnl::memory::desc& desc); static std::shared_ptr makeDescriptor(const_dnnl_memory_desc_t desc); /** * @brief Helper function that creates DnnlBlockedMemoryDesc from defined dnnl::memory::desc and undefined shape. - * It uses desc as an basis for the new undefined one. Specifically, type, layout, precision, blocks, extra data will be preserved. + * It uses desc as an basis for the new undefined one. Specifically, type, layout, precision, blocks, extra data + * will be preserved. * @param desc dnnl::memory::desc dnnl desc which will be used as a basis of the new descriptor * @param shape a new undefined shape * @return pointer to the created DnnlBlockedMemoryDesc * @note Obly blocked descriptors are allowed at the moment */ - static std::shared_ptr makeUndefinedDesc(const dnnl::memory::desc &desc, const Shape& shape); + static std::shared_ptr makeUndefinedDesc(const dnnl::memory::desc& desc, const Shape& shape); static size_t getMemSizeForDnnlDesc(const dnnl::memory::desc& desc); - static std::shared_ptr query_md(const const_dnnl_primitive_desc_t& pd, const dnnl::query& what, int idx = 0); + static std::shared_ptr query_md(const const_dnnl_primitive_desc_t& pd, + const dnnl::query& what, + int idx = 0); static std::string query_impl_info_str(const const_dnnl_primitive_desc_t& pd); - template + template static bool find_implementation(dnnl::primitive_desc& desc, T&& comparator) { dnnl::primitive_desc_iterator& itpd = desc; @@ -77,7 +80,7 @@ class DnnlExtensionUtils { return false; } - 
template + template static void for_each_implementation(dnnl::primitive_desc& desc, bool first_match, T&& comparator, L&& func) { dnnl::primitive_desc_iterator& itpd = desc; @@ -113,5 +116,5 @@ class DnnlExtensionUtils { const std::shared_ptr& dstDesc); }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/dnnl_postops_composer.cpp b/src/plugins/intel_cpu/src/dnnl_postops_composer.cpp index 2f82fbe553ae19..7d62e5cb6b673d 100644 --- a/src/plugins/intel_cpu/src/dnnl_postops_composer.cpp +++ b/src/plugins/intel_cpu/src/dnnl_postops_composer.cpp @@ -11,21 +11,69 @@ #include #include +#include "cpu_types.h" #include "memory_desc/dnnl_blocked_memory_desc.h" +#include "nodes/executors/memory_arguments.hpp" #include "openvino/core/type/element_type.hpp" +#include "utils/cpu_utils.hpp" #include "utils/debug_capabilities.h" namespace ov { namespace intel_cpu { +static std::vector getDeQuantizedScales(const MemoryArgs& memory) { + if (!memory.count(ARG_DST_DEQ_SCALE)) + return {}; + + auto scalesMemory = memory.at(ARG_DST_DEQ_SCALE); + + auto scalesData = static_cast(scalesMemory->getData()); + + if (!scalesData) + return {}; + + auto dstShape = memory.at(ARG_DST)->getShape(); + auto dqScalesShape = scalesMemory->getShape(); + + auto scalesDims = getNormalizedDimsBySize(dqScalesShape.getDims(), dstShape.getDims().size()); + + auto scaleSize = std::accumulate(scalesDims.begin(), scalesDims.end(), std::size_t(1), std::multiplies()); + + std::vector DQScales(scaleSize, 1.0); + + OPENVINO_ASSERT(scaleSize == 1 || DQScales.size() == 1 || DQScales.size() == scaleSize, + "set invalid scales size , DQScales vector size: ", + DQScales.size(), + ", scale data size: ", + scaleSize); + + // @todo do we really need to broadcast dq scales and then resize them back? 
+ if (scaleSize > DQScales.size()) + DQScales.resize(scaleSize, DQScales[0]); + if (1 == scaleSize) { + std::transform(DQScales.begin(), DQScales.end(), DQScales.begin(), [=](float val) { + return (scalesData[0] * val); + }); + } else { + for (size_t i = 0; i < DQScales.size(); i++) { + DQScales[i] *= scalesData[i]; + } + } + if (std::all_of(DQScales.begin(), DQScales.end(), [&](float val) { + return (val == DQScales[0]); + })) + DQScales.resize(1); + + return DQScales; +} + DnnlPostOpsComposer::DnnlPostOpsComposer(const PostOps& postOps, const dnnl::engine& engine, const VectorDims& outputDims, const size_t indexOfOutputChannelDim, const bool isInt8, const int weiScaleMaskPerChannel, - const std::vector& DQScales, - const bool hasBias, + const MemoryArgs& memory, const dnnl::memory::data_type outDataType) : engine(engine), postOps(postOps), @@ -39,6 +87,7 @@ DnnlPostOpsComposer::DnnlPostOpsComposer(const PostOps& postOps, dimsPerOC = dimsPerTensor = VectorDims(outputDims.size(), 1); dimsPerOC[idxOC] = OC; + const auto& DQScales = getDeQuantizedScales(memory); // generalise dq scales, so extra logic is necessary here. if (isINT8) { wei_scale_values = DQScales.empty() ? std::vector{1.0} : DQScales; @@ -49,6 +98,7 @@ DnnlPostOpsComposer::DnnlPostOpsComposer(const PostOps& postOps, updateWeiScales(); // If having the bias, attr weight scale can't be updated for further ops-ops optimization. // ONEDNN 3.x quantization for scheme: QuantizedInput * QuantizedWeight * DQScale + Bias. + const bool hasBias = !memory.at(ARG_BIAS)->getDesc().empty(); weightScaleAvailable = !hasBias; } else if (!DQScales.empty()) { // DQ scale is fused but swiching back to non-INT8 for execution in some cases. 
@@ -325,9 +375,9 @@ static OptimizedFormula updateOptimizedFormula(const FakeQuantizePostOp& postOp, } bool DnnlPostOpsComposer::appendAttrPostOps(const FakeQuantizePostOp& postOp, - bool isLastPostOp, - bool doRounding, - bool allowBinary) { + bool isLastPostOp, + bool doRounding, + bool allowBinary) { DEBUG_LOG("isLastPostOp=", isLastPostOp, ", outDataType=", @@ -541,9 +591,9 @@ bool DnnlPostOpsComposer::appendShift(const std::vector& shift, bool allo } bool DnnlPostOpsComposer::appendLinear(const std::vector& scale, - const std::vector& shift, - bool isLastPostOp, - bool allowBinary) { + const std::vector& shift, + bool isLastPostOp, + bool allowBinary) { if (scale.size() == 1 && shift.size() == 1) { if (shift[0] == 0.0f) return appendScale(scale, isLastPostOp, allowBinary); @@ -599,15 +649,27 @@ static MemoryPtr prepackDecompressionParams(const MemoryCPtr& paramsPtr, if (shape.size() == 1 && shape[0] == 1) { shape.push_back(1); } + if (shape.size() != 2 && shape.size() != 3) - OPENVINO_THROW("DnnlPostOpsComposer cannot prepack decompression params with invalid shape"); + OPENVINO_THROW("DnnlPostOpsComposer cannot prepack decompression params with invalid shape"); - Shape dstShape = needTranspose ? Shape({shape[0], shape[1]}) : Shape({shape[shape.size() - 1], shape[0]}); - DnnlBlockedMemoryDesc dstMemoryDesc(dstShape, DnnlExtensionUtils::ElementTypeToDataType(dstPrc), dnnl::memory::format_tag::io); - auto dstMem = std::make_shared(engine, dstMemoryDesc); + // weights without batch: (OC, G) + // weights with batch: (B, OC, G) + const size_t OC = shape[shape.size() - 2]; + const size_t G = shape[shape.size() - 1]; + + Shape dstShape = Shape({OC, G}); + DnnlBlockedMemoryDesc dstMemoryDesc(dstShape, + DnnlExtensionUtils::ElementTypeToDataType(dstPrc), + dnnl::memory::format_tag::io); + auto dstMem = std::make_shared(engine, dstMemoryDesc); auto srcFormat = needTranspose ? 
dnnl::memory::format_tag::oi : dnnl::memory::format_tag::io; - DnnlBlockedMemoryDesc srcMemoryDesc(dstShape, DnnlExtensionUtils::ElementTypeToDataType(paramsPtr->getDescPtr()->getPrecision()), srcFormat); + + DnnlBlockedMemoryDesc srcMemoryDesc( + dstShape, + DnnlExtensionUtils::ElementTypeToDataType(paramsPtr->getDescPtr()->getPrecision()), + srcFormat); auto srcMem = std::make_shared(engine, srcMemoryDesc, paramsPtr->getData()); dstMem->load(*srcMem); @@ -615,25 +677,31 @@ static MemoryPtr prepackDecompressionParams(const MemoryCPtr& paramsPtr, return dstMem; } -void DnnlPostOpsComposer::appendDecompressionScales(const MemoryCPtr& scales_ptr, bool needTranspose, ov::element::Type dstPrecision) { +void DnnlPostOpsComposer::appendDecompressionScales(const MemoryCPtr& scales_ptr, + bool needTranspose, + ov::element::Type dstPrecision) { if (scales_ptr == nullptr) return; auto scalesMem = prepackDecompressionParams(scales_ptr, needTranspose, dstPrecision, engine); attr.set_scales_dims(DNNL_ARG_WEIGHTS, - DnnlExtensionUtils::convertToDnnlDims(scalesMem->getStaticDims()), DnnlExtensionUtils::ElementTypeToDataType(dstPrecision)); + DnnlExtensionUtils::convertToDnnlDims(scalesMem->getStaticDims()), + DnnlExtensionUtils::ElementTypeToDataType(dstPrecision)); cpuArgs[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS] = std::move(scalesMem); dnnlArgs[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS] = cpuArgs[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS]->getPrimitive(); } -void DnnlPostOpsComposer::appendDecompressionZeroPoints(const MemoryCPtr& zero_points_ptr, bool needTranspose, ov::element::Type dstPrecision) { +void DnnlPostOpsComposer::appendDecompressionZeroPoints(const MemoryCPtr& zero_points_ptr, + bool needTranspose, + ov::element::Type dstPrecision) { if (zero_points_ptr == nullptr) return; auto zeroPointsMem = prepackDecompressionParams(zero_points_ptr, needTranspose, dstPrecision, engine); attr.set_zero_points_dims(DNNL_ARG_WEIGHTS, - 
DnnlExtensionUtils::convertToDnnlDims(zeroPointsMem->getStaticDims()), DnnlExtensionUtils::ElementTypeToDataType(dstPrecision)); + DnnlExtensionUtils::convertToDnnlDims(zeroPointsMem->getStaticDims()), + DnnlExtensionUtils::ElementTypeToDataType(dstPrecision)); cpuArgs[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_WEIGHTS] = zeroPointsMem; dnnlArgs[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_WEIGHTS] = zeroPointsMem->getPrimitive(); } diff --git a/src/plugins/intel_cpu/src/dnnl_postops_composer.h b/src/plugins/intel_cpu/src/dnnl_postops_composer.h index c07ec0f608b6db..7ae634658b005f 100644 --- a/src/plugins/intel_cpu/src/dnnl_postops_composer.h +++ b/src/plugins/intel_cpu/src/dnnl_postops_composer.h @@ -12,8 +12,8 @@ #include "cpu_memory.h" #include "nodes/executors/dnnl/dnnl_aliases.hpp" -#include "post_ops.hpp" #include "nodes/executors/dnnl/dnnl_post_op_data.hpp" +#include "post_ops.hpp" namespace ov { namespace intel_cpu { @@ -27,12 +27,13 @@ class DnnlPostOpsComposer { const size_t indexOfOutputChannelDim, const bool isINT8, const int weiScaleMaskPerChannel, - const std::vector& DQScales, - const bool hasBias, + const MemoryArgs& memory, const dnnl::memory::data_type outDataType); DnnlPrimitiveAttrs compose(); void appendDecompressionScales(const MemoryCPtr& scales_ptr, bool needTranspose, ov::element::Type dstPrecision); - void appendDecompressionZeroPoints(const MemoryCPtr& zero_points_ptr, bool needTranspose, ov::element::Type dstPrecision); + void appendDecompressionZeroPoints(const MemoryCPtr& zero_points_ptr, + bool needTranspose, + ov::element::Type dstPrecision); void setDynamicQuantizationParams(uint64_t groupSize); private: diff --git a/src/plugins/intel_cpu/src/dnnl_postops_composer_legacy.cpp b/src/plugins/intel_cpu/src/dnnl_postops_composer_legacy.cpp index cb59492463f410..3e40ead65d6cc3 100644 --- a/src/plugins/intel_cpu/src/dnnl_postops_composer_legacy.cpp +++ b/src/plugins/intel_cpu/src/dnnl_postops_composer_legacy.cpp @@ -3,9 +3,11 @@ // #include 
"dnnl_postops_composer_legacy.h" + #include #include + #include "utils/debug_capabilities.h" namespace ov { @@ -39,10 +41,10 @@ DnnlPostOpsComposerLegacy::DnnlPostOpsComposerLegacy(const dnnl::engine& engine, wei_scale_mask = wei_scale_values.size() > 1 ? weiScaleMaskPerChannel : 0; dst_scale_val = 1.0; - //set the DQscale into attr weight scale before appending any post-ops. + // set the DQscale into attr weight scale before appending any post-ops. updateWeiScales(); - //If having the bias, attr weight scale can't be updated for further ops-ops optimization. - //ONEDNN 3.x quantization for scheme: QuantizedInput * QuantizedWeight * DQScale + Bias. + // If having the bias, attr weight scale can't be updated for further ops-ops optimization. + // ONEDNN 3.x quantization for scheme: QuantizedInput * QuantizedWeight * DQScale + Bias. weightScaleAvailable = !hasBias; } else if (!DQScales.empty()) { // DQ scale is fused but swiching back to non-INT8 for execution in some cases. @@ -115,22 +117,22 @@ bool DnnlPostOpsComposerLegacy::appendScale(const std::vector& scale, boo return true; } if (weightScaleAvailable) { - //oneDNN v3.* weight scale can also be used in the further optimization patterns. - // there are so many possible optimizations can be done, for example: + // oneDNN v3.* weight scale can also be used in the further optimization patterns. 
+ // there are so many possible optimizations can be done, for example: // - // we can switch the existing postOps's order to take - // advantage of output scale if it's available: - // relu(x)*scale = relu(x*scale) - // or we can fuse it into previous one as long as they are - // compatible in shape - // x*A*s = x*(A*s) - // or even with add: - // (x*A + B)*s = x*(A*s) + (B*s) - // or we can combine these two tricks: - // relu(x*A)*s = relu(x*(A*s)) + // we can switch the existing postOps's order to take + // advantage of output scale if it's available: + // relu(x)*scale = relu(x*scale) + // or we can fuse it into previous one as long as they are + // compatible in shape + // x*A*s = x*(A*s) + // or even with add: + // (x*A + B)*s = x*(A*s) + (B*s) + // or we can combine these two tricks: + // relu(x*A)*s = relu(x*(A*s)) // - // we cannot implement all of them, so we just add the one - // that we observed in real models. + // we cannot implement all of them, so we just add the one + // that we observed in real models. 
if ((ops.len() == 0)) fuseIntoWeiScale = true; @@ -201,9 +203,9 @@ bool DnnlPostOpsComposerLegacy::appendShift(const std::vector& shift, boo } bool DnnlPostOpsComposerLegacy::appendLinear(const std::vector& scale, - const std::vector& shift, - bool isLastPostOp, - bool allowBinary) { + const std::vector& shift, + bool isLastPostOp, + bool allowBinary) { if (scale.size() == 1 && shift.size() == 1) { if (shift[0] == 0.0f) return appendScale(scale, isLastPostOp, allowBinary); diff --git a/src/plugins/intel_cpu/src/dnnl_postops_composer_legacy.h b/src/plugins/intel_cpu/src/dnnl_postops_composer_legacy.h index 82fdda94012f15..485fa31fb5d956 100644 --- a/src/plugins/intel_cpu/src/dnnl_postops_composer_legacy.h +++ b/src/plugins/intel_cpu/src/dnnl_postops_composer_legacy.h @@ -8,11 +8,10 @@ */ #pragma once -#include "dnnl_types.h" - #include #include "cpu_memory.h" +#include "dnnl_types.h" #include "memory_desc/cpu_memory_desc.h" #include "memory_desc/dnnl_blocked_memory_desc.h" #include "onednn/dnnl.h" @@ -39,7 +38,10 @@ class DnnlPostOpsComposerLegacy { void appendRoundHTE(); bool appendScale(const std::vector& scale, bool isLastPostOp, bool allowBinary = true); bool appendShift(const std::vector& shift, bool allowBinary = true); - bool appendLinear(const std::vector& scale, const std::vector& shift, bool isLastPostOp, bool allowBinary = true); + bool appendLinear(const std::vector& scale, + const std::vector& shift, + bool isLastPostOp, + bool allowBinary = true); void appendClip(const std::vector& low, const std::vector& high); const VectorDims& getOutputDims() { diff --git a/src/plugins/intel_cpu/src/edge.cpp b/src/plugins/intel_cpu/src/edge.cpp index 82bde8edae2b4a..1eabc6275bf4b0 100644 --- a/src/plugins/intel_cpu/src/edge.cpp +++ b/src/plugins/intel_cpu/src/edge.cpp @@ -3,16 +3,21 @@ // #include "edge.h" -#include "node.h" + #include "dnnl_extension_utils.h" +#include "node.h" +#include "openvino/core/type/element_type.hpp" #include "openvino/util/pp.hpp" using 
namespace dnnl; namespace ov { namespace intel_cpu { -Edge::Edge(const NodePtr &parent, const NodePtr &child, int pr_port, int ch_port) : - parent(parent), child(child), parent_port(pr_port), child_port(ch_port) {} +Edge::Edge(const NodePtr& parent, const NodePtr& child, int pr_port, int ch_port) + : parent(parent), + child(child), + parent_port(pr_port), + child_port(ch_port) {} const NodePtr Edge::getParent() const { auto parentPtr = parent.lock(); @@ -38,14 +43,14 @@ bool Edge::isDropped() const { auto parent_ptr = parent.lock(); if (parent_ptr) { - for (auto &edge : parent_ptr->childEdges) + for (auto& edge : parent_ptr->childEdges) if (edge.lock().get() == this) not_in_parent = false; } auto child_ptr = child.lock(); if (child_ptr) { - for (auto &edge : child_ptr->parentEdges) + for (auto& edge : child_ptr->parentEdges) if (edge.lock().get() == this) not_in_child = false; } @@ -130,8 +135,8 @@ bool Edge::enforceReorder() { } static inline bool isPhycicalMemCompatible(const MemoryDesc& lhsMemDesc, const MemoryDesc& rhsMemDesc) { - if (!lhsMemDesc.isDefined() || !rhsMemDesc.isDefined() || - !(lhsMemDesc.getType() & MemoryDescType::Blocked) || !(rhsMemDesc.getType() & MemoryDescType::Blocked) || + if (!lhsMemDesc.isDefined() || !rhsMemDesc.isDefined() || !(lhsMemDesc.getType() & MemoryDescType::Blocked) || + !(rhsMemDesc.getType() & MemoryDescType::Blocked) || (lhsMemDesc.getType() == DnnlBlocked && !lhsMemDesc.as()->hasEmptyExtraData()) || (rhsMemDesc.getType() == DnnlBlocked && !rhsMemDesc.as()->hasEmptyExtraData())) return false; @@ -139,13 +144,21 @@ static inline bool isPhycicalMemCompatible(const MemoryDesc& lhsMemDesc, const M const auto lhsBlockMemDesc = lhsMemDesc.as(); const auto rhsBlockMemDesc = rhsMemDesc.as(); - if (lhsBlockMemDesc->getShape() != rhsBlockMemDesc->getShape() || lhsBlockMemDesc->getPrecision() != rhsBlockMemDesc->getPrecision()) + if (lhsBlockMemDesc->getShape() != rhsBlockMemDesc->getShape() || + lhsBlockMemDesc->getPrecision() != 
rhsBlockMemDesc->getPrecision()) return false; // dims padding check - bool isZeroDimsPaddings = - std::all_of(lhsBlockMemDesc->getOffsetPaddingToData().begin(), lhsBlockMemDesc->getOffsetPaddingToData().end(), [](size_t x){ return x == 0; }) && - std::all_of(rhsBlockMemDesc->getOffsetPaddingToData().begin(), rhsBlockMemDesc->getOffsetPaddingToData().end(), [](size_t x){ return x == 0; }); + bool isZeroDimsPaddings = std::all_of(lhsBlockMemDesc->getOffsetPaddingToData().begin(), + lhsBlockMemDesc->getOffsetPaddingToData().end(), + [](size_t x) { + return x == 0; + }) && + std::all_of(rhsBlockMemDesc->getOffsetPaddingToData().begin(), + rhsBlockMemDesc->getOffsetPaddingToData().end(), + [](size_t x) { + return x == 0; + }); bool isSameElementsCount = lhsBlockMemDesc->getPaddedElementsCount() == rhsBlockMemDesc->getPaddedElementsCount(); if (!isZeroDimsPaddings || !isSameElementsCount) return false; @@ -160,7 +173,8 @@ static inline bool isPhycicalMemCompatible(const MemoryDesc& lhsMemDesc, const M std::vector lhsStridesDefault(lhsBlockDims.size()); lhsStridesDefault[lhsBlockDims.size() - 1] = 1; for (size_t i = 2; i <= lhsBlockDims.size(); i++) { - lhsStridesDefault[lhsBlockDims.size() - i] = lhsStridesDefault[lhsBlockDims.size() - (i - 1)] * lhsBlockDims[lhsBlockDims.size() - (i - 1)]; + lhsStridesDefault[lhsBlockDims.size() - i] = + lhsStridesDefault[lhsBlockDims.size() - (i - 1)] * lhsBlockDims[lhsBlockDims.size() - (i - 1)]; } auto rhsBlockDims = rhsBlockMemDesc->getBlockDims(); @@ -168,11 +182,11 @@ static inline bool isPhycicalMemCompatible(const MemoryDesc& lhsMemDesc, const M rhsStridesDefault[rhsBlockDims.size() - 1] = 1; for (size_t i = 2; i <= rhsBlockDims.size(); i++) { rhsStridesDefault[rhsBlockDims.size() - i] = - rhsStridesDefault[rhsBlockDims.size() - (i - 1)] * rhsBlockDims[rhsBlockDims.size() - (i - 1)]; + rhsStridesDefault[rhsBlockDims.size() - (i - 1)] * rhsBlockDims[rhsBlockDims.size() - (i - 1)]; } - // this check needed to avoid inserting 
unnecessary reorders if the memory is used in place and the batch size is equal to 1 - // in nodes like concate and split + // this check needed to avoid inserting unnecessary reorders if the memory is used in place and the batch size is + // equal to 1 in nodes like concate and split size_t lhsSkipAxis = lhsBlockDims.size() > 0 && lhsBlockDims[0] == 1 ? 0 : Shape::UNDEFINED_DIM; size_t rhsSkipAxis = rhsBlockDims.size() > 0 && rhsBlockDims[0] == 1 ? 0 : Shape::UNDEFINED_DIM; @@ -212,10 +226,16 @@ Edge::ReorderStatus Edge::needReorder() { bool optimized = false; auto inputPortDesc = getInputPortDesc(); auto outPortDesc = getOutputPortDesc(); + + if (inputPortDesc->getMemDesc()->getPrecision() == element::undefined) + return ReorderStatus::No; + // Check whether the child node may accept the parent produced tensor if (!outPortDesc->isCompatible(*inputPortDesc)) { - // Performance optimization which exploit the fact that some tensors do not need actual data reordering to be read using different descriptors - if (isPhycicalMemCompatible(*inputPortDesc->getMemDesc(), *outPortDesc->getMemDesc()) && !getParent()->isConstant()) { + // Performance optimization which exploit the fact that some tensors do not need actual data reordering to be + // read using different descriptors + if (isPhycicalMemCompatible(*inputPortDesc->getMemDesc(), *outPortDesc->getMemDesc()) && + !getParent()->isConstant()) { optimized = true; } else { return ReorderStatus::Regular; @@ -292,8 +312,8 @@ std::string Edge::hash() const { std::stringstream result; - return parentPtr->getName() + "_" + std::to_string(parent_port) + "_" + - childPtr->getName() + "_" + std::to_string(child_port); + return parentPtr->getName() + "_" + std::to_string(parent_port) + "_" + childPtr->getName() + "_" + + std::to_string(child_port); } void Edge::externalAllocate(WeightsSharing::Ptr weightsCache) { @@ -301,10 +321,13 @@ void Edge::externalAllocate(WeightsSharing::Ptr weightsCache) { return; if (weightsCache) { - 
auto alloc = [this] () { + auto alloc = [this]() { auto allocateFunc = [this](const MemoryDesc& inputDesc) -> MemoryPtr { auto parentPtr = getParent(); - return std::make_shared(parentPtr->getEngine(), inputDesc, nullptr, false); // no pads zeroing + return std::make_shared(parentPtr->getEngine(), + inputDesc, + nullptr, + false); // no pads zeroing }; allocateCommon(allocateFunc); @@ -410,13 +433,16 @@ const MemoryDesc& Edge::getOutputDesc() const { } const MemoryDesc& Edge::getDesc() const { + if (getInputDesc().getPrecision() == element::undefined) + return getInputDesc(); + if (!getInputDesc().isCompatible(getOutputDesc())) OPENVINO_THROW("Cannot get descriptor for edge: ", getParent()->getName(), "->", getChild()->getName()); return getInputDesc(); } -const IMemory &Edge::getMemory() { +const IMemory& Edge::getMemory() { auto memPtr = getMemoryPtr(); OPENVINO_ASSERT(memPtr != nullptr, " Dereferencing NULL memory in edge: ", *this); return *memPtr; @@ -426,7 +452,7 @@ MemoryPtr Edge::getMemoryPtr() const { return memoryPtr; } -void Edge::sharedMemFrom(const EdgePtr &edge) { +void Edge::sharedMemFrom(const EdgePtr& edge) { memoryFromEdge = edge; DEBUG_LOG(*this, " sharedMemFrom ", *edge); status = Status::NotAllocated; @@ -466,10 +492,8 @@ void Edge::init() { DEBUG_LOG(*this, " getBaseEdge() return itself"); changeStatus(Status::NeedAllocation); } else { - if (Type::Input == edgePtr->getParent()->getType() && - Type::MemoryInput != getParent()->getType() && - edgePtr->getParent()->isConstant() && - !edgePtr->getChild()->isConstant()) { + if (Type::Input == edgePtr->getParent()->getType() && Type::MemoryInput != getParent()->getType() && + edgePtr->getParent()->isConstant() && !edgePtr->getChild()->isConstant()) { changeStatus(Status::NeedAllocation); DEBUG_LOG(*this, " edge inplace from ", *edgePtr, " is broken!"); return; @@ -497,11 +521,11 @@ EdgePtr Edge::getBaseEdge(int look) { if ((childInPlacePort >= 0) && (look & LOOK_DOWN)) { auto ch_edges = 
getChild()->getChildEdgesAtPort(childInPlacePort); - auto &next_ch_edge = ch_edges[0]; + auto& next_ch_edge = ch_edges[0]; // Multiple connection to some out port // Will try to find inplace consumer - for (auto &ch_edge : ch_edges) { + for (auto& ch_edge : ch_edges) { if (ch_edge->getChild()->inPlaceInputPort(ch_edge->getOutputNum()) >= 0) { next_ch_edge = ch_edge; // To align with upstream-inplace, we stop searching once found the first inplace consumer @@ -517,14 +541,16 @@ EdgePtr Edge::getBaseEdge(int look) { for (auto edge : edgesForSamePort) { if (edge.get() != this) { // Return once found the first inplace consumer - if (edge->inPlace()) return edge; + if (edge->inPlace()) + return edge; } } // Return the first output edge as the base if there is no inPlace consumers // thus benefits zero-copy of outputs. for (auto edge : edgesForSamePort) { - if (Type::Output == edge->getChild()->getType()) return edge; + if (Type::Output == edge->getChild()->getType()) + return edge; } return edgesForSamePort[0]; @@ -571,7 +597,7 @@ NodePtr Edge::modifiedInPlace() const { for (size_t i = 0; i < outConfs.size(); ++i) { const auto& conf = outConfs[i]; if (childPort < 0 || conf.inPlace() != childPort || - Type::MemoryInput == childNode->getType()) { //exception type, it doesn't modify memory + Type::MemoryInput == childNode->getType()) { // exception type, it doesn't modify memory continue; } if (childNode->isExecutable()) { @@ -591,12 +617,14 @@ NodePtr Edge::modifiedInPlace() const { return nullptr; } -std::ostream& operator<<(std::ostream &os, const Edge& edge) { - return os << "(" << edge.getParent()->getName() << ")" << "[" << edge.getInputNum() << "] " +std::ostream& operator<<(std::ostream& os, const Edge& edge) { + return os << "(" << edge.getParent()->getName() << ")" + << "[" << edge.getInputNum() << "] " << "<->" - << "(" << edge.getChild()->getName() << ")" << "[" << edge.getOutputNum() << "]" + << "(" << edge.getChild()->getName() << ")" + << "[" << 
edge.getOutputNum() << "]" << ":" << Edge::statusToString(edge.getStatus()); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/edge.h b/src/plugins/intel_cpu/src/edge.h index 5c418b2665924d..38f49ff00db075 100644 --- a/src/plugins/intel_cpu/src/edge.h +++ b/src/plugins/intel_cpu/src/edge.h @@ -4,15 +4,15 @@ #pragma once +#include +#include + #include "cpu_shape.h" #include "internal_properties.hpp" #include "memory_desc/cpu_memory_desc.h" #include "nodes/node_config.h" #include "weights_cache.hpp" -#include -#include - namespace ov { namespace intel_cpu { @@ -24,23 +24,11 @@ using EdgeWeakPtr = std::weak_ptr; class Edge { public: - Edge(const std::shared_ptr& parent, - const std::shared_ptr& child, - int pr_port = 0, int ch_port = 0); - - enum class Status { - Uninitialized, - NeedAllocation, - NotAllocated, - Allocated, - Validated - }; - - enum class ReorderStatus { - Regular = 0, - Optimized = 1, - No = 2 - }; + Edge(const std::shared_ptr& parent, const std::shared_ptr& child, int pr_port = 0, int ch_port = 0); + + enum class Status { Uninitialized, NeedAllocation, NotAllocated, Allocated, Validated }; + + enum class ReorderStatus { Regular = 0, Optimized = 1, No = 2 }; enum LOOK { LOOK_UP = 1, LOOK_DOWN = 2, LOOK_BOTH = LOOK_UP | LOOK_DOWN }; @@ -52,15 +40,15 @@ class Edge { #define CASE(_status) \ case Status::_status: \ return #_status; - switch (status) { - CASE(Uninitialized); - CASE(NeedAllocation); - CASE(NotAllocated); - CASE(Allocated); - CASE(Validated); - } + switch (status) { + CASE(Uninitialized); + CASE(NeedAllocation); + CASE(NotAllocated); + CASE(Allocated); + CASE(Validated); + } #undef CASE - return "Unexpected"; + return "Unexpected"; } void changeStatus(Status state); @@ -87,7 +75,9 @@ class Edge { int getInputNum() const; int getOutputNum() const; - void setChildPort(const size_t port) { child_port = port; } + void setChildPort(const size_t port) { + child_port 
= port; + } void sharedMemFrom(const EdgePtr& edge); EdgePtr getSharedEdge() const; @@ -126,8 +116,7 @@ class Edge { friend class Graph; }; -std::ostream& operator<<(std::ostream &os, const Edge& edge); - -} // namespace intel_cpu -} // namespace ov +std::ostream& operator<<(std::ostream& os, const Edge& edge); +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp index 83cdd252f9bc6f..39a2d20c092835 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp @@ -516,6 +516,51 @@ std::set> jit_floor_emitter::get_supported_precisions return {{element::f32}}; } +/// FLOOR_MOD /// +jit_floor_mod_emitter::jit_floor_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { +} + +jit_floor_mod_emitter::jit_floor_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc): jit_emitter(host, host_isa, exec_prc) { +} + +size_t jit_floor_mod_emitter::get_inputs_count() const { return 2; } + +size_t jit_floor_mod_emitter::get_aux_vecs_count() const { return 1; } + +void jit_floor_mod_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + OV_CPU_JIT_EMITTER_THROW("Can't create jit eltwise kernel"); + } +} + +template +void jit_floor_mod_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); 
+ + using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; + + TReg dividend = TReg(in_vec_idxs[0]); + TReg divisor = TReg(in_vec_idxs[1]); + TReg r = TReg(out_vec_idxs[0]); + TReg aux = TReg(aux_vec_idxs[0]); + + h->fdiv(aux.s, dividend.s, divisor.s); + h->frintm(aux.s, aux.s); + h->fmul(aux.s, aux.s, divisor.s); + h->fsub(r.s, dividend.s, aux.s); +} + +std::set> jit_floor_mod_emitter::get_supported_precisions(const std::shared_ptr& node) { + return {{element::f32, element::f32}}; +} + /// CEILING /// //Initialization of the emitter, taking node as input jit_ceiling_emitter::jit_ceiling_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, @@ -1934,6 +1979,88 @@ void jit_relu_emitter::emit_isa(const std::vector &in_vec_idxs, const st h->fmaxnm(dst.s, src.s, tmp.s); } +/// ROUND_HALF_AWAY_FROM_ZERO /// +jit_round_half_away_from_zero_emitter::jit_round_half_away_from_zero_emitter + (dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { +} + +jit_round_half_away_from_zero_emitter::jit_round_half_away_from_zero_emitter + (dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { +} + +size_t jit_round_half_away_from_zero_emitter::get_inputs_count() const { return 1; } + +std::set> jit_round_half_away_from_zero_emitter::get_supported_precisions(const std::shared_ptr& node) { + return {{element::f32}}; +} + +void jit_round_half_away_from_zero_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { + if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + OV_CPU_JIT_EMITTER_THROW("Can't create jit eltwise kernel"); + } +} + +template +void jit_round_half_away_from_zero_emitter::emit_isa(const std::vector 
&in_vec_idxs, const std::vector &out_vec_idxs) const { + OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); + + using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; + + TReg src = TReg(in_vec_idxs[0]); + TReg dst = TReg(out_vec_idxs[0]); + + h->frinta(dst.s, src.s); +} + +/// ROUND_HALF_TO_EVEN /// +jit_round_half_to_even_emitter::jit_round_half_to_even_emitter + (dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { +} + +jit_round_half_to_even_emitter::jit_round_half_to_even_emitter + (dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { +} + +size_t jit_round_half_to_even_emitter::get_inputs_count() const { return 1; } + +std::set> jit_round_half_to_even_emitter::get_supported_precisions(const std::shared_ptr& node) { + return {{element::f32}}; +} + +void jit_round_half_to_even_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { + if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + OV_CPU_JIT_EMITTER_THROW("Can't create jit eltwise kernel"); + } +} + +template +void jit_round_half_to_even_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); + + using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; + + TReg src = TReg(in_vec_idxs[0]); + TReg dst = TReg(out_vec_idxs[0]); + + h->frintn(dst.s, src.s); +} + /// SELECT /// jit_select_emitter::jit_select_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, diff --git 
a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp index fa4f4141c388e4..2173a1487f1057 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp @@ -213,7 +213,28 @@ class jit_floor_emitter : public jit_emitter { template void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; }; +class jit_floor_mod_emitter : public jit_emitter { +public: + jit_floor_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc = ov::element::f32); + + jit_floor_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node); + + size_t get_inputs_count() const override; + + size_t get_aux_vecs_count() const override; + + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); +private: + void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + + template + void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; +}; class jit_ceiling_emitter : public jit_emitter { public: // Constructor with explicit precision @@ -810,6 +831,48 @@ class jit_relu_emitter : public jit_emitter { void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; }; +class jit_round_half_away_from_zero_emitter : public jit_emitter { +public: + jit_round_half_away_from_zero_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc = ov::element::f32); + + jit_round_half_away_from_zero_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node); + 
+ size_t get_inputs_count() const override; + + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + +private: + void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + + template + void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; +}; + +class jit_round_half_to_even_emitter : public jit_emitter { +public: + jit_round_half_to_even_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc = ov::element::f32); + + jit_round_half_to_even_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node); + + size_t get_inputs_count() const override; + + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + +private: + void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + + template + void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; +}; + class jit_select_emitter : public jit_emitter { public: jit_select_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/debug_capabilities.cpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/debug_capabilities.cpp index 01af9dbde7fe01..c2c6ddf6f271fc 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/debug_capabilities.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/debug_capabilities.cpp @@ -4,9 +4,10 @@ #ifdef CPU_DEBUG_CAPS -#include "debug_capabilities.hpp" -#include -#include +# include "debug_capabilities.hpp" + +# include +# include namespace ov { namespace intel_cpu { @@ -14,25 +15,26 @@ namespace intel_cpu { using namespace Xbyak; using namespace dnnl::impl::cpu::x64; -template void RegPrinter::print(jit_generator &h, Xmm reg, const char *name); -template void 
RegPrinter::print(jit_generator &h, Xmm reg, const char *name); -template void RegPrinter::print(jit_generator &h, Ymm reg, const char *name); -template void RegPrinter::print(jit_generator &h, Ymm reg, const char *name); -template void RegPrinter::print(jit_generator &h, Zmm reg, const char *name); -template void RegPrinter::print(jit_generator &h, Zmm reg, const char *name); -template void RegPrinter::print(jit_generator &h, Reg64 reg, const char *name); -template void RegPrinter::print(jit_generator &h, Reg64 reg, const char *name); -template void RegPrinter::print(jit_generator &h, Reg32 reg, const char *name); -template void RegPrinter::print(jit_generator &h, Reg32 reg, const char *name); -template void RegPrinter::print(jit_generator &h, Reg16 reg, const char *name); -template void RegPrinter::print(jit_generator &h, Reg16 reg, const char *name); -template void RegPrinter::print(jit_generator &h, Reg8 reg, const char *name); -template void RegPrinter::print(jit_generator &h, Reg8 reg, const char *name); +template void RegPrinter::print(jit_generator& h, Xmm reg, const char* name); +template void RegPrinter::print(jit_generator& h, Xmm reg, const char* name); +template void RegPrinter::print(jit_generator& h, Ymm reg, const char* name); +template void RegPrinter::print(jit_generator& h, Ymm reg, const char* name); +template void RegPrinter::print(jit_generator& h, Zmm reg, const char* name); +template void RegPrinter::print(jit_generator& h, Zmm reg, const char* name); +template void RegPrinter::print(jit_generator& h, Reg64 reg, const char* name); +template void RegPrinter::print(jit_generator& h, Reg64 reg, const char* name); +template void RegPrinter::print(jit_generator& h, Reg32 reg, const char* name); +template void RegPrinter::print(jit_generator& h, Reg32 reg, const char* name); +template void RegPrinter::print(jit_generator& h, Reg16 reg, const char* name); +template void RegPrinter::print(jit_generator& h, Reg16 reg, const char* name); +template 
void RegPrinter::print(jit_generator& h, Reg8 reg, const char* name); +template void RegPrinter::print(jit_generator& h, Reg8 reg, const char* name); template -void RegPrinter::print_reg_prc(const char *name, const char *ori_name, T *ptr) { +void RegPrinter::print_reg_prc(const char* name, const char* ori_name, T* ptr) { std::stringstream ss; - if (name) ss << name << " | "; + if (name) + ss << name << " | "; ss << ori_name << ": "; if (std::is_floating_point::value) { ss << *ptr; @@ -48,9 +50,10 @@ void RegPrinter::print_reg_prc(const char *name, const char *ori_name, T *ptr) { } template -void RegPrinter::print_vmm_prc(const char *name, const char *ori_name, PRC_T *ptr) { +void RegPrinter::print_vmm_prc(const char* name, const char* ori_name, PRC_T* ptr) { std::stringstream ss; - if (name) ss << name << " | "; + if (name) + ss << name << " | "; ss << ori_name << ": {" << ptr[0]; for (size_t i = 1; i < vlen / sizeof(float); i++) { ss << ", " << ptr[i]; @@ -58,15 +61,15 @@ void RegPrinter::print_vmm_prc(const char *name, const char *ori_name, PRC_T *pt ss << "}" << std::endl; std::cout << ss.str(); } -template void RegPrinter::print_vmm_prc(const char *name, const char *ori_name, float *ptr); -template void RegPrinter::print_vmm_prc(const char *name, const char *ori_name, float *ptr); -template void RegPrinter::print_vmm_prc(const char *name, const char *ori_name, float *ptr); -template void RegPrinter::print_vmm_prc(const char *name, const char *ori_name, int *ptr); -template void RegPrinter::print_vmm_prc(const char *name, const char *ori_name, int *ptr); -template void RegPrinter::print_vmm_prc(const char *name, const char *ori_name, int *ptr); +template void RegPrinter::print_vmm_prc(const char* name, const char* ori_name, float* ptr); +template void RegPrinter::print_vmm_prc(const char* name, const char* ori_name, float* ptr); +template void RegPrinter::print_vmm_prc(const char* name, const char* ori_name, float* ptr); +template void 
RegPrinter::print_vmm_prc(const char* name, const char* ori_name, int* ptr); +template void RegPrinter::print_vmm_prc(const char* name, const char* ori_name, int* ptr); +template void RegPrinter::print_vmm_prc(const char* name, const char* ori_name, int* ptr); template -struct vmm_traits{}; +struct vmm_traits {}; template <> struct vmm_traits { @@ -87,7 +90,7 @@ struct vmm_traits { }; template -void RegPrinter::save_vmm(jit_generator &h) { +void RegPrinter::save_vmm(jit_generator& h) { h.sub(h.rsp, vmm_traits::vmm_len * vmm_traits::vmm_cnt); for (size_t i = 0; i < vmm_traits::vmm_cnt; i++) { h.uni_vmovups(h.ptr[h.rsp + i * vmm_traits::vmm_len], T(i)); @@ -95,52 +98,52 @@ void RegPrinter::save_vmm(jit_generator &h) { } template -void RegPrinter::restore_vmm(jit_generator &h) { +void RegPrinter::restore_vmm(jit_generator& h) { for (size_t i = 0; i < vmm_traits::vmm_cnt; i++) { h.uni_vmovups(T(i), h.ptr[h.rsp + i * vmm_traits::vmm_len]); } h.add(h.rsp, vmm_traits::vmm_len * vmm_traits::vmm_cnt); } -void RegPrinter::save_reg(jit_generator &h) { +void RegPrinter::save_reg(jit_generator& h) { h.sub(h.rsp, reg_len * reg_cnt); for (size_t i = 0; i < reg_cnt; i++) { h.mov(h.ptr[h.rsp + i * reg_len], Reg64(i)); } } -void RegPrinter::restore_reg(jit_generator &h) { +void RegPrinter::restore_reg(jit_generator& h) { for (size_t i = 0; i < reg_cnt; i++) { h.mov(Reg64(i), h.ptr[h.rsp + i * reg_len]); } h.add(h.rsp, reg_len * reg_cnt); } -void RegPrinter::preamble(jit_generator &h) { +void RegPrinter::preamble(jit_generator& h) { save_reg(h); - mayiuse(cpu_isa_t::avx512_core) ? save_vmm(h) : (mayiuse(cpu_isa_t::avx2) ? - save_vmm(h) : save_vmm(h)); + mayiuse(cpu_isa_t::avx512_core) ? save_vmm(h) + : (mayiuse(cpu_isa_t::avx2) ? save_vmm(h) : save_vmm(h)); } -void RegPrinter::postamble(jit_generator &h) { - mayiuse(cpu_isa_t::avx512_core) ? restore_vmm(h) : (mayiuse(cpu_isa_t::avx2) ? 
- restore_vmm(h) : restore_vmm(h)); +void RegPrinter::postamble(jit_generator& h) { + mayiuse(cpu_isa_t::avx512_core) ? restore_vmm(h) + : (mayiuse(cpu_isa_t::avx2) ? restore_vmm(h) : restore_vmm(h)); restore_reg(h); } // ABI requires 16-bype stack alignment before a call -void RegPrinter::align_rsp(jit_generator &h) { +void RegPrinter::align_rsp(jit_generator& h) { constexpr int alignment = 16; h.mov(h.r15, h.rsp); h.and_(h.rsp, ~(alignment - 1)); } -void RegPrinter::restore_rsp(jit_generator &h) { +void RegPrinter::restore_rsp(jit_generator& h) { h.mov(h.rsp, h.r15); } template -void RegPrinter::print_vmm(jit_generator &h, REG_T vmm, const char *name) { +void RegPrinter::print_vmm(jit_generator& h, REG_T vmm, const char* name) { preamble(h); h.push(h.rax); @@ -181,7 +184,7 @@ void RegPrinter::print_vmm(jit_generator &h, REG_T vmm, const char *name) { } template -void RegPrinter::print_reg(jit_generator &h, REG_T reg, const char *name) { +void RegPrinter::print_reg(jit_generator& h, REG_T reg, const char* name) { preamble(h); h.push(h.rax); @@ -213,8 +216,7 @@ void RegPrinter::print_reg(jit_generator &h, REG_T reg, const char *name) { postamble(h); } -} // namespace intel_cpu -} // namespace ov - +} // namespace intel_cpu +} // namespace ov -#endif // CPU_DEBUG_CAPS +#endif // CPU_DEBUG_CAPS diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/debug_capabilities.hpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/debug_capabilities.hpp index fd7135b17bf5b9..dcac847dfd1e0f 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/debug_capabilities.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/debug_capabilities.hpp @@ -6,7 +6,7 @@ #ifdef CPU_DEBUG_CAPS -#include "cpu/x64/jit_generator.hpp" +# include "cpu/x64/jit_generator.hpp" namespace ov { namespace intel_cpu { @@ -56,42 +56,44 @@ namespace intel_cpu { class RegPrinter { public: using jit_generator = dnnl::impl::cpu::x64::jit_generator; - template ::value, int>::type = 0> - static void 
print(jit_generator &h, REG_T reg, const char *name = nullptr) { + template ::value, int>::type = 0> + static void print(jit_generator& h, REG_T reg, const char* name = nullptr) { print_vmm(h, reg, name); } - template ::value, int>::type = 0> - static void print(jit_generator &h, REG_T reg, const char *name = nullptr) { + template ::value, int>::type = 0> + static void print(jit_generator& h, REG_T reg, const char* name = nullptr) { print_reg(h, reg, name); } private: RegPrinter() {} template - static void print_vmm(jit_generator &h, REG_T vmm, const char *name); + static void print_vmm(jit_generator& h, REG_T vmm, const char* name); template - static void print_reg(jit_generator &h, REG_T reg, const char *name); + static void print_reg(jit_generator& h, REG_T reg, const char* name); template - static void print_vmm_prc(const char *name, const char *ori_name, PRC_T *ptr); + static void print_vmm_prc(const char* name, const char* ori_name, PRC_T* ptr); template - static void print_reg_prc(const char *name, const char *ori_name, T *val); - static void preamble(jit_generator &h); - static void postamble(jit_generator &h); + static void print_reg_prc(const char* name, const char* ori_name, T* val); + static void preamble(jit_generator& h); + static void postamble(jit_generator& h); template - static void save_vmm(jit_generator &h); + static void save_vmm(jit_generator& h); template - static void restore_vmm(jit_generator &h); - static void save_reg(jit_generator &h); - static void restore_reg(jit_generator &h); - static void align_rsp(jit_generator &h); - static void restore_rsp(jit_generator &h); + static void restore_vmm(jit_generator& h); + static void save_reg(jit_generator& h); + static void restore_reg(jit_generator& h); + static void align_rsp(jit_generator& h); + static void restore_rsp(jit_generator& h); static constexpr size_t reg_len = 8; static constexpr size_t reg_cnt = 16; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // 
namespace ov -#endif // CPU_DEBUG_CAPS +#endif // CPU_DEBUG_CAPS diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_bf16_emitters.hpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_bf16_emitters.hpp index 43a2c2eb6b045f..2bfbaa68880aa8 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_bf16_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_bf16_emitters.hpp @@ -11,14 +11,18 @@ namespace intel_cpu { class jit_uni_vcvtneps2bf16 : public jit_emitter { public: - jit_uni_vcvtneps2bf16(dnnl::impl::cpu::x64::jit_generator* host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - ov::element::Type exec_prc = ov::element::bf16) : jit_emitter(host, host_isa, exec_prc) { + jit_uni_vcvtneps2bf16(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + ov::element::Type exec_prc = ov::element::bf16) + : jit_emitter(host, host_isa, exec_prc) { if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_bf16) && !dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2_vnni_2)) prepare_table(); } - size_t get_inputs_num() const override { return 1; } + size_t get_inputs_num() const override { + return 1; + } private: void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override { @@ -36,7 +40,8 @@ class jit_uni_vcvtneps2bf16 : public jit_emitter { template void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { using namespace Xbyak; - using Vmm = typename dnnl::impl::utils::conditional3::type; + using Vmm = typename dnnl::impl::utils:: + conditional3::type; Vmm in = Vmm(in_vec_idxs[0]); @@ -79,7 +84,7 @@ class jit_uni_vcvtneps2bf16 : public jit_emitter { h->uni_vpackusdw(aux, aux, aux); if (host_isa_ == dnnl::impl::cpu::x64::cpu_isa_t::avx2) { - h->vpermq(Ymm(aux.getIdx()), Ymm(aux.getIdx()), 0xD8); //11 01 10 00 + h->vpermq(Ymm(aux.getIdx()), Ymm(aux.getIdx()), 0xD8); // 11 01 10 00 h->vextracti128(out, Ymm(aux.getIdx()), 0); } else { 
h->uni_vmovups(out, aux); @@ -123,5 +128,5 @@ class jit_uni_vcvtneps2bf16 : public jit_emitter { } }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_conversion_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_conversion_emitters.cpp index 544960008c9158..2e90af39fb9cf1 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_conversion_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_conversion_emitters.cpp @@ -6,7 +6,6 @@ #include "utils/bfloat16.hpp" - using namespace dnnl::impl::utils; using namespace dnnl::impl; using namespace dnnl::impl::cpu::x64; @@ -15,19 +14,23 @@ using namespace Xbyak; namespace ov { namespace intel_cpu { -jit_convert_emitter::jit_convert_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) { +jit_convert_emitter::jit_convert_emitter(jit_generator* host, + cpu_isa_t host_isa, + const std::shared_ptr& node, + ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { input_type = node->get_input_element_type(0); output_type = node->get_output_element_type(0); if (output_type == ov::element::bf16) - uni_vcvtneps2bf16.reset(new jit_uni_vcvtneps2bf16(host, host_isa)); + uni_vcvtneps2bf16.reset(new jit_uni_vcvtneps2bf16(host, host_isa)); } void jit_convert_emitter::validate_types() const { auto is_supported_type = [this](const ov::element::Type& type) { - return any_of(supported_types.begin(), supported_types.end(), - [&type](const ov::element::Type& supported_type) { return supported_type == type; } ); + return any_of(supported_types.begin(), supported_types.end(), [&type](const ov::element::Type& supported_type) { + return supported_type == type; + }); }; if (!is_supported_type(input_type)) @@ -36,7 +39,9 @@ void jit_convert_emitter::validate_types() const { 
OV_CPU_JIT_EMITTER_THROW("Unsupported output type: ", output_type.get_type_name()); } -size_t jit_convert_emitter::get_inputs_num() const { return 1; } +size_t jit_convert_emitter::get_inputs_num() const { + return 1; +} void jit_convert_emitter::emit_data() const { jit_emitter::emit_data(); @@ -45,19 +50,22 @@ void jit_convert_emitter::emit_data() const { } template -void jit_convert_emitter::float2bfloat(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_convert_emitter::float2bfloat(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src = Vmm(in_vec_idxs[0]); - Vmm vmm_dst = Vmm(out_vec_idxs[0]); + Vmm vmm_dst = Vmm(out_vec_idxs[0]); if (!uni_vcvtneps2bf16) OV_CPU_JIT_EMITTER_THROW("Converter from float to bf16 isn't initialized!"); uni_vcvtneps2bf16->emit_code({static_cast(vmm_src.getIdx())}, {static_cast(vmm_dst.getIdx())}); } -jit_convert_truncation_emitter::jit_convert_truncation_emitter(jit_generator *host, cpu_isa_t host_isa, - const std::shared_ptr& node, ov::element::Type exec_prc) - : jit_convert_emitter(host, host_isa, node, exec_prc) { +jit_convert_truncation_emitter::jit_convert_truncation_emitter(jit_generator* host, + cpu_isa_t host_isa, + const std::shared_ptr& node, + ov::element::Type exec_prc) + : jit_convert_emitter(host, host_isa, node, exec_prc) { prepare_table(); } @@ -66,7 +74,8 @@ bool jit_convert_truncation_emitter::is_i8_and_u8_case() const { one_of(output_type, ov::element::i8, ov::element::u8); } -void jit_convert_truncation_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_convert_truncation_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { validate_types(); if (host_isa_ == cpu::x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); @@ -80,10 +89,11 @@ void jit_convert_truncation_emitter::emit_impl(const std::vector &in_vec } template 
-void jit_convert_truncation_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_convert_truncation_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src = Vmm(in_vec_idxs[0]); - Vmm vmm_dst = Vmm(out_vec_idxs[0]); + Vmm vmm_dst = Vmm(out_vec_idxs[0]); Xmm xmm_dst = Xmm(out_vec_idxs[0]); Ymm ymm_dst = Ymm(out_vec_idxs[0]); @@ -97,95 +107,95 @@ void jit_convert_truncation_emitter::emit_isa(const std::vector &in_vec_ } switch (input_type) { - case ov::element::f32: - if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8)) - h->uni_vcvttps2dq(vmm_dst, vmm_src); - break; - case ov::element::i32: - if (one_of(output_type, ov::element::f32, ov::element::bf16, ov::element::f16)) - h->uni_vcvtdq2ps(vmm_dst, vmm_src); - break; - case ov::element::bf16: - h->vpmovzxwd(vmm_dst, vmm_src); - h->uni_vpslld(vmm_dst, vmm_dst, 16); - if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8)) - h->uni_vcvttps2dq(vmm_dst, vmm_dst); - break; - case ov::element::f16: - if (isa == dnnl::impl::cpu::x64::avx512_core) - h->vcvtph2ps(vmm_dst, Ymm(vmm_src.getIdx())); - else - h->vcvtph2ps(vmm_dst, - Xmm(vmm_src.getIdx())); // for avx2_vnni_2? 
- if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8)) - h->uni_vcvttps2dq(vmm_dst, vmm_dst); - break; - case ov::element::i8: - h->uni_vpmovsxbd(vmm_dst, vmm_src); - break; - case ov::element::u8: - h->uni_vpmovzxbd(vmm_dst, vmm_src); - break; - default: - OV_CPU_JIT_EMITTER_THROW("Unsupported input data type"); + case ov::element::f32: + if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8)) + h->uni_vcvttps2dq(vmm_dst, vmm_src); + break; + case ov::element::i32: + if (one_of(output_type, ov::element::f32, ov::element::bf16, ov::element::f16)) + h->uni_vcvtdq2ps(vmm_dst, vmm_src); + break; + case ov::element::bf16: + h->vpmovzxwd(vmm_dst, vmm_src); + h->uni_vpslld(vmm_dst, vmm_dst, 16); + if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8)) + h->uni_vcvttps2dq(vmm_dst, vmm_dst); + break; + case ov::element::f16: + if (isa == dnnl::impl::cpu::x64::avx512_core) + h->vcvtph2ps(vmm_dst, Ymm(vmm_src.getIdx())); + else + h->vcvtph2ps(vmm_dst, + Xmm(vmm_src.getIdx())); // for avx2_vnni_2? 
+ if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8)) + h->uni_vcvttps2dq(vmm_dst, vmm_dst); + break; + case ov::element::i8: + h->uni_vpmovsxbd(vmm_dst, vmm_src); + break; + case ov::element::u8: + h->uni_vpmovzxbd(vmm_dst, vmm_src); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported input data type"); } switch (output_type) { - case ov::element::f32: - if (!one_of(input_type, ov::element::i32, ov::element::bf16, ov::element::f16)) { + case ov::element::f32: + if (!one_of(input_type, ov::element::i32, ov::element::bf16, ov::element::f16)) { + h->uni_vcvtdq2ps(vmm_dst, vmm_dst); + } + break; + case ov::element::i32: + break; + case ov::element::bf16: + if (input_type == ov::element::f32) { + float2bfloat({static_cast(vmm_src.getIdx())}, {static_cast(vmm_dst.getIdx())}); + } else { + if (one_of(input_type, ov::element::i8, ov::element::u8)) { h->uni_vcvtdq2ps(vmm_dst, vmm_dst); } - break; - case ov::element::i32: - break; - case ov::element::bf16: - if (input_type == ov::element::f32) { - float2bfloat({static_cast(vmm_src.getIdx())}, {static_cast(vmm_dst.getIdx())}); - } else { - if (one_of(input_type, ov::element::i8, ov::element::u8)) { - h->uni_vcvtdq2ps(vmm_dst, vmm_dst); - } - float2bfloat({static_cast(vmm_dst.getIdx())}, {static_cast(vmm_dst.getIdx())}); - } - break; - case ov::element::f16: - if (input_type == ov::element::f32) { - if (isa == dnnl::impl::cpu::x64::avx512_core) - h->vcvtps2ph(ymm_dst, vmm_src, 0x4); - else - h->vcvtps2ph(xmm_dst, vmm_src, 0x4); - } else { - if (one_of(input_type, ov::element::i8, ov::element::u8)) { - h->uni_vcvtdq2ps(vmm_dst, vmm_dst); - } - if (isa == dnnl::impl::cpu::x64::avx512_core) - h->vcvtps2ph(ymm_dst, vmm_dst, 0x4); - else - h->vcvtps2ph(xmm_dst, vmm_dst, 0x4); - } - break; - case ov::element::i8: - case ov::element::u8: - if (input_type == ov::element::i32) { - dword2int8({static_cast(vmm_src.getIdx())}, {static_cast(vmm_dst.getIdx())}); - } else { - 
dword2int8({static_cast(vmm_dst.getIdx())}, {static_cast(vmm_dst.getIdx())}); + float2bfloat({static_cast(vmm_dst.getIdx())}, {static_cast(vmm_dst.getIdx())}); + } + break; + case ov::element::f16: + if (input_type == ov::element::f32) { + if (isa == dnnl::impl::cpu::x64::avx512_core) + h->vcvtps2ph(ymm_dst, vmm_src, 0x4); + else + h->vcvtps2ph(xmm_dst, vmm_src, 0x4); + } else { + if (one_of(input_type, ov::element::i8, ov::element::u8)) { + h->uni_vcvtdq2ps(vmm_dst, vmm_dst); } - break; - default: - OV_CPU_JIT_EMITTER_THROW("Unsupported output data type"); + if (isa == dnnl::impl::cpu::x64::avx512_core) + h->vcvtps2ph(ymm_dst, vmm_dst, 0x4); + else + h->vcvtps2ph(xmm_dst, vmm_dst, 0x4); + } + break; + case ov::element::i8: + case ov::element::u8: + if (input_type == ov::element::i32) { + dword2int8({static_cast(vmm_src.getIdx())}, {static_cast(vmm_dst.getIdx())}); + } else { + dword2int8({static_cast(vmm_dst.getIdx())}, {static_cast(vmm_dst.getIdx())}); + } + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported output data type"); } } void jit_convert_truncation_emitter::register_table_entries() { - if (host_isa_ == dnnl::impl::cpu::x64::avx2 && - one_of(output_type, ov::element::i8, ov::element::u8) && + if (host_isa_ == dnnl::impl::cpu::x64::avx2 && one_of(output_type, ov::element::i8, ov::element::u8) && !is_i8_and_u8_case()) push_arg_entry_of("mask_byte", 0x000000ff, true); } template -void jit_convert_truncation_emitter::dword2int8(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_convert_truncation_emitter::dword2int8(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src = Vmm(in_vec_idxs[0]); @@ -204,12 +214,14 @@ void jit_convert_truncation_emitter::dword2int8(const std::vector &in_ve } } -jit_convert_saturation_emitter::jit_convert_saturation_emitter(jit_generator *host, cpu_isa_t host_isa, - const std::shared_ptr& node, ov::element::Type 
exec_prc) - : jit_convert_emitter(host, host_isa, node, exec_prc) { -} +jit_convert_saturation_emitter::jit_convert_saturation_emitter(jit_generator* host, + cpu_isa_t host_isa, + const std::shared_ptr& node, + ov::element::Type exec_prc) + : jit_convert_emitter(host, host_isa, node, exec_prc) {} -void jit_convert_saturation_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_convert_saturation_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { validate_types(); if (host_isa_ == cpu::x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); @@ -223,10 +235,11 @@ void jit_convert_saturation_emitter::emit_impl(const std::vector &in_vec } template -void jit_convert_saturation_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_convert_saturation_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src = Vmm(in_vec_idxs[0]); - Vmm vmm_dst = Vmm(out_vec_idxs[0]); + Vmm vmm_dst = Vmm(out_vec_idxs[0]); Xmm xmm_dst = Xmm(out_vec_idxs[0]); Ymm ymm_dst = Ymm(out_vec_idxs[0]); @@ -237,88 +250,94 @@ void jit_convert_saturation_emitter::emit_isa(const std::vector &in_vec_ } switch (input_type) { - case ov::element::f32: - if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8)) - h->uni_vcvtps2dq(vmm_dst, vmm_src); - break; - case ov::element::i32: - if (one_of(output_type, ov::element::f32, ov::element::bf16, ov::element::f16)) - h->uni_vcvtdq2ps(vmm_dst, vmm_src); - break; - case ov::element::bf16: - h->vpmovzxwd(vmm_dst, vmm_src); - h->uni_vpslld(vmm_dst, vmm_dst, 16); - if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8)) - h->uni_vcvttps2dq(vmm_dst, vmm_dst); - break; - case ov::element::f16: - if (isa == dnnl::impl::cpu::x64::avx512_core) - h->vcvtph2ps(vmm_dst, Ymm(vmm_src.getIdx())); - else - 
h->vcvtph2ps(vmm_dst, - Xmm(vmm_src.getIdx())); // for avx2_vnni_2? - if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8)) - h->uni_vcvttps2dq(vmm_dst, vmm_dst); - break; - case ov::element::i8: - h->uni_vpmovsxbd(vmm_dst, vmm_src); - break; - case ov::element::u8: - h->uni_vpmovzxbd(vmm_dst, vmm_src); - break; - default: - OV_CPU_JIT_EMITTER_THROW("Unsupported input data type"); + case ov::element::f32: + if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8)) + h->uni_vcvtps2dq(vmm_dst, vmm_src); + break; + case ov::element::i32: + if (one_of(output_type, ov::element::f32, ov::element::bf16, ov::element::f16)) + h->uni_vcvtdq2ps(vmm_dst, vmm_src); + break; + case ov::element::bf16: + h->vpmovzxwd(vmm_dst, vmm_src); + h->uni_vpslld(vmm_dst, vmm_dst, 16); + if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8)) + h->uni_vcvttps2dq(vmm_dst, vmm_dst); + break; + case ov::element::f16: + if (isa == dnnl::impl::cpu::x64::avx512_core) + h->vcvtph2ps(vmm_dst, Ymm(vmm_src.getIdx())); + else + h->vcvtph2ps(vmm_dst, + Xmm(vmm_src.getIdx())); // for avx2_vnni_2? 
+ if (one_of(output_type, ov::element::i32, ov::element::i8, ov::element::u8)) + h->uni_vcvttps2dq(vmm_dst, vmm_dst); + break; + case ov::element::i8: + h->uni_vpmovsxbd(vmm_dst, vmm_src); + break; + case ov::element::u8: + h->uni_vpmovzxbd(vmm_dst, vmm_src); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported input data type"); } switch (output_type) { - case ov::element::f32: - if (!one_of(input_type, ov::element::i32, ov::element::bf16, ov::element::f16)) { + case ov::element::f32: + if (!one_of(input_type, ov::element::i32, ov::element::bf16, ov::element::f16)) { + h->uni_vcvtdq2ps(vmm_dst, vmm_dst); + } + break; + case ov::element::i32: + break; + case ov::element::bf16: + if (input_type == ov::element::f32) { + float2bfloat({static_cast(vmm_src.getIdx())}, {static_cast(vmm_dst.getIdx())}); + } else { + if (one_of(input_type, ov::element::i8, ov::element::u8)) { h->uni_vcvtdq2ps(vmm_dst, vmm_dst); } - break; - case ov::element::i32: - break; - case ov::element::bf16: - if (input_type == ov::element::f32) { - float2bfloat({static_cast(vmm_src.getIdx())}, {static_cast(vmm_dst.getIdx())}); - } else { - if (one_of(input_type, ov::element::i8, ov::element::u8)) { - h->uni_vcvtdq2ps(vmm_dst, vmm_dst); - } - float2bfloat({static_cast(vmm_dst.getIdx())}, {static_cast(vmm_dst.getIdx())}); - } - break; - case ov::element::f16: - if (input_type == ov::element::f32) { - if (isa == dnnl::impl::cpu::x64::avx512_core) - h->vcvtps2ph(ymm_dst, vmm_src, 0x4); - else - h->vcvtps2ph(xmm_dst, vmm_src, 0x4); - } else { - if (one_of(input_type, ov::element::i8, ov::element::u8)) { - h->uni_vcvtdq2ps(vmm_dst, vmm_dst); - } - if (isa == dnnl::impl::cpu::x64::avx512_core) - h->vcvtps2ph(ymm_dst, vmm_dst, 0x4); - else - h->vcvtps2ph(xmm_dst, vmm_dst, 0x4); - } - break; - case ov::element::i8: - case ov::element::u8: - if (input_type == ov::element::i32) { - dword2int8({static_cast(vmm_src.getIdx())}, {static_cast(vmm_dst.getIdx())}, output_type.is_signed()); - } else { - 
dword2int8({static_cast(vmm_dst.getIdx())}, {static_cast(vmm_dst.getIdx())}, output_type.is_signed()); + float2bfloat({static_cast(vmm_dst.getIdx())}, {static_cast(vmm_dst.getIdx())}); + } + break; + case ov::element::f16: + if (input_type == ov::element::f32) { + if (isa == dnnl::impl::cpu::x64::avx512_core) + h->vcvtps2ph(ymm_dst, vmm_src, 0x4); + else + h->vcvtps2ph(xmm_dst, vmm_src, 0x4); + } else { + if (one_of(input_type, ov::element::i8, ov::element::u8)) { + h->uni_vcvtdq2ps(vmm_dst, vmm_dst); } - break; - default: - OV_CPU_JIT_EMITTER_THROW("Unsupported output data type"); + if (isa == dnnl::impl::cpu::x64::avx512_core) + h->vcvtps2ph(ymm_dst, vmm_dst, 0x4); + else + h->vcvtps2ph(xmm_dst, vmm_dst, 0x4); + } + break; + case ov::element::i8: + case ov::element::u8: + if (input_type == ov::element::i32) { + dword2int8({static_cast(vmm_src.getIdx())}, + {static_cast(vmm_dst.getIdx())}, + output_type.is_signed()); + } else { + dword2int8({static_cast(vmm_dst.getIdx())}, + {static_cast(vmm_dst.getIdx())}, + output_type.is_signed()); + } + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported output data type"); } } template -void jit_convert_saturation_emitter::dword2int8(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, bool is_signed) const { +void jit_convert_saturation_emitter::dword2int8(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs, + bool is_signed) const { using Vmm = typename conditional3::type; Vmm vmm_src = Vmm(in_vec_idxs[0]); @@ -330,7 +349,7 @@ void jit_convert_saturation_emitter::dword2int8(const std::vector &in_ve if (is_signed) { h->vpmovsdb(xmm_dst, vmm_src); } else { - Vmm vmm_zero = Vmm(aux_vec_idxs[0]); + Vmm vmm_zero = Vmm(aux_vec_idxs[0]); h->vpxord(vmm_zero, vmm_zero, vmm_zero); h->vpmaxsd(vmm_dst, vmm_src, vmm_zero); h->vpmovusdb(xmm_dst, vmm_dst); @@ -353,8 +372,8 @@ void jit_convert_saturation_emitter::dword2int8(const std::vector &in_ve size_t jit_convert_saturation_emitter::aux_vecs_count() 
const { // 1 register is for dword2int8 unsigned - return output_type == ov::element::u8 && host_isa_ == dnnl::impl::cpu::x64::avx512_core? 1 : 0; + return output_type == ov::element::u8 && host_isa_ == dnnl::impl::cpu::x64::avx512_core ? 1 : 0; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_conversion_emitters.hpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_conversion_emitters.hpp index ee451ed358dd1a..29b85079573bee 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_conversion_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_conversion_emitters.hpp @@ -4,16 +4,18 @@ #pragma once -#include "jit_emitter.hpp" #include "jit_bf16_emitters.hpp" +#include "jit_emitter.hpp" namespace ov { namespace intel_cpu { class jit_convert_emitter : public jit_emitter { public: - jit_convert_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); + jit_convert_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; @@ -22,19 +24,13 @@ class jit_convert_emitter : public jit_emitter { void validate_types() const; template - void float2bfloat(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void float2bfloat(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; ov::element::Type input_type; ov::element::Type output_type; - const ov::element::TypeVector supported_types = { - ov::element::f32, - ov::element::i32, - ov::element::bf16, - ov::element::f16, - ov::element::i8, - ov::element::u8 - }; + const ov::element::TypeVector supported_types = + {ov::element::f32, ov::element::i32, ov::element::bf16, ov::element::f16, ov::element::i8, 
ov::element::u8}; std::shared_ptr uni_vcvtneps2bf16 = nullptr; }; @@ -45,16 +41,18 @@ class jit_convert_emitter : public jit_emitter { // 129 -> -127 class jit_convert_truncation_emitter : public jit_convert_emitter { public: - jit_convert_truncation_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); + jit_convert_truncation_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32); private: void emit_impl(const std::vector& in, const std::vector& out) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; template - void dword2int8(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void dword2int8(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; bool is_i8_and_u8_case() const; void register_table_entries() override; @@ -66,19 +64,23 @@ class jit_convert_truncation_emitter : public jit_convert_emitter { // 129 -> 127 class jit_convert_saturation_emitter : public jit_convert_emitter { public: - jit_convert_saturation_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); + jit_convert_saturation_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32); private: void emit_impl(const std::vector& in, const std::vector& out) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; 
template - void dword2int8(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, bool is_signed) const; + void dword2int8(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs, + bool is_signed) const; size_t aux_vecs_count() const override; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_dnnl_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_dnnl_emitters.cpp index 0b315cdd309715..51e801208b927c 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_dnnl_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_dnnl_emitters.cpp @@ -3,6 +3,7 @@ // #include "jit_dnnl_emitters.hpp" + #include using namespace dnnl::impl::utils; @@ -17,9 +18,11 @@ std::set> jit_dnnl_emitter::get_supported_precisions( return {{element::f32}}; } -jit_dnnl_emitter::jit_dnnl_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, ov::element::Type exec_prc) +jit_dnnl_emitter::jit_dnnl_emitter(jit_generator* host, + cpu_isa_t host_isa, + const std::shared_ptr& node, + ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { - kind = dnnl_eltwise_tanh; alpha = 0.f; beta = 0.f; @@ -27,33 +30,42 @@ jit_dnnl_emitter::jit_dnnl_emitter(jit_generator *host, cpu_isa_t host_isa, cons set_injector(); } -jit_dnnl_emitter::jit_dnnl_emitter(jit_generator *host, cpu_isa_t host_isa, - dnnl_alg_kind_t algKind, float alpha, float beta, +jit_dnnl_emitter::jit_dnnl_emitter(jit_generator* host, + cpu_isa_t host_isa, + dnnl_alg_kind_t algKind, + float alpha, + float beta, ov::element::Type exec_prc) - : jit_emitter(host, host_isa, exec_prc), kind(algKind), alpha(alpha), beta(beta) { - + : jit_emitter(host, host_isa, exec_prc), + kind(algKind), + alpha(alpha), + beta(beta) { set_injector(); } void jit_dnnl_emitter::set_injector() { if (host_isa_ == cpu::x64::sse41) { - eltwise_injector_sse42 = std::make_shared>( - 
h, kind, alpha, beta, 1.f); + eltwise_injector_sse42 = + std::make_shared>(h, kind, alpha, beta, 1.f); } else if (host_isa_ == cpu::x64::avx2) { - eltwise_injector_avx2 = std::make_shared>( - h, kind, alpha, beta, 1.f); + eltwise_injector_avx2 = + std::make_shared>(h, kind, alpha, beta, 1.f); } else if (host_isa_ == cpu::x64::avx512_core) { - eltwise_injector_avx512_core = std::make_shared>( - h, kind, alpha, beta, 1.f); + eltwise_injector_avx512_core = + std::make_shared>(h, kind, alpha, beta, 1.f); } else { OV_CPU_JIT_EMITTER_THROW("Unsupported ISA ", host_isa_); } } -size_t jit_dnnl_emitter::get_inputs_num() const { return 1; } +size_t jit_dnnl_emitter::get_inputs_num() const { + return 1; +} -void jit_dnnl_emitter::emit_code(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const { +void jit_dnnl_emitter::emit_code(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const { if (host_isa_ == cpu::x64::sse41) { if (out_vec_idxs[0] != in_vec_idxs[0]) h->uni_vmovups(Xmm(out_vec_idxs[0]), Xmm(in_vec_idxs[0])); @@ -83,11 +95,13 @@ void jit_dnnl_emitter::emit_data() const { } } -jit_dnnl_aux_emitter::jit_dnnl_aux_emitter(jit_generator *host, cpu_isa_t host_isa, - dnnl_alg_kind_t algKind, float inpAlpha, float inpBeta, +jit_dnnl_aux_emitter::jit_dnnl_aux_emitter(jit_generator* host, + cpu_isa_t host_isa, + dnnl_alg_kind_t algKind, + float inpAlpha, + float inpBeta, ov::element::Type exec_prc) - : jit_dnnl_emitter(host, host_isa, algKind, inpAlpha, inpBeta, exec_prc) { -} + : jit_dnnl_emitter(host, host_isa, algKind, inpAlpha, inpBeta, exec_prc) {} -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_dnnl_emitters.hpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_dnnl_emitters.hpp index 
bdf04108370ed5..22e003ad261555 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_dnnl_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_dnnl_emitters.hpp @@ -4,8 +4,8 @@ #pragma once -#include "cpu/x64/jit_generator.hpp" #include "cpu/x64/injectors/jit_uni_eltwise_injector.hpp" +#include "cpu/x64/jit_generator.hpp" #include "jit_emitter.hpp" namespace ov { @@ -13,30 +13,41 @@ namespace intel_cpu { class jit_dnnl_emitter : public jit_emitter { public: - void emit_code(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const override; + void emit_code(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const override; void emit_data() const override; - void emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const override {}; + void emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const override{}; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); protected: - jit_dnnl_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - dnnl_alg_kind_t algKind, float inpAlpha, float inpBeta, - ov::element::Type exec_prc = ov::element::f32); - jit_dnnl_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - ov::element::Type exec_prc = ov::element::f32); + jit_dnnl_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + dnnl_alg_kind_t algKind, + float inpAlpha, + float inpBeta, + ov::element::Type exec_prc = ov::element::f32); + jit_dnnl_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc 
= ov::element::f32); void set_injector(); - dnnl_alg_kind_t kind {dnnl_alg_kind_undef}; - float alpha {0.f}; - float beta {0.f}; + dnnl_alg_kind_t kind{dnnl_alg_kind_undef}; + float alpha{0.f}; + float beta{0.f}; - std::shared_ptr> eltwise_injector_sse42; - std::shared_ptr> eltwise_injector_avx2; - std::shared_ptr> eltwise_injector_avx512_core; + std::shared_ptr> + eltwise_injector_sse42; + std::shared_ptr> + eltwise_injector_avx2; + std::shared_ptr> + eltwise_injector_avx512_core; private: size_t get_inputs_num() const override; @@ -44,12 +55,15 @@ class jit_dnnl_emitter : public jit_emitter { class jit_dnnl_aux_emitter : public jit_dnnl_emitter { public: - jit_dnnl_aux_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - dnnl_alg_kind_t algKind, float inpAlpha, float inpBeta, - ov::element::Type exec_prc = ov::element::f32); + jit_dnnl_aux_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + dnnl_alg_kind_t algKind, + float inpAlpha, + float inpBeta, + ov::element::Type exec_prc = ov::element::f32); private: }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_dnnl_ext_emitters.hpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_dnnl_ext_emitters.hpp index 7a4d1e31277e3b..0b7396b6fcd830 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_dnnl_ext_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_dnnl_ext_emitters.hpp @@ -4,9 +4,9 @@ #pragma once +#include "jit_dnnl_emitters.hpp" #include "openvino/opsets/opset5.hpp" #include "transformations/cpu_opset/common/op/swish_cpu.hpp" -#include "jit_dnnl_emitters.hpp" #include "utils/ngraph_utils.hpp" namespace ov { @@ -14,88 +14,102 @@ namespace intel_cpu { class jit_relu_emitter : public jit_dnnl_emitter { public: - jit_relu_emitter(dnnl::impl::cpu::x64::jit_generator *host, 
dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - ov::element::Type exec_prc = ov::element::f32) + jit_relu_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32) : jit_dnnl_emitter(host, host_isa, n, exec_prc) { - kind = dnnl_eltwise_relu; - alpha = 0.f; - beta = 0.f; + kind = dnnl_eltwise_relu; + alpha = 0.f; + beta = 0.f; - set_injector(); - } + set_injector(); + } }; class jit_sigmoid_emitter : public jit_dnnl_emitter { public: - jit_sigmoid_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - ov::element::Type exec_prc = ov::element::f32) + jit_sigmoid_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32) : jit_dnnl_emitter(host, host_isa, n, exec_prc) { - kind = dnnl_eltwise_logistic; - alpha = 0.f; - beta = 0.f; + kind = dnnl_eltwise_logistic; + alpha = 0.f; + beta = 0.f; - set_injector(); - } + set_injector(); + } }; class jit_tanh_emitter : public jit_dnnl_emitter { public: - jit_tanh_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - ov::element::Type exec_prc = ov::element::f32) + jit_tanh_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32) : jit_dnnl_emitter(host, host_isa, n, exec_prc) { - kind = dnnl_eltwise_tanh; - alpha = 0.f; - beta = 0.f; + kind = dnnl_eltwise_tanh; + alpha = 0.f; + beta = 0.f; - set_injector(); - } + set_injector(); + } }; class jit_elu_emitter : public jit_dnnl_emitter { public: - jit_elu_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - ov::element::Type 
exec_prc = ov::element::f32) + jit_elu_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32) : jit_dnnl_emitter(host, host_isa, n, exec_prc) { - kind = dnnl_eltwise_elu; - alpha = ov::as_type_ptr(n)->get_alpha(); - beta = 0.f; + kind = dnnl_eltwise_elu; + alpha = ov::as_type_ptr(n)->get_alpha(); + beta = 0.f; - set_injector(); - } + set_injector(); + } }; class jit_abs_emitter : public jit_dnnl_emitter { public: - jit_abs_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - ov::element::Type exec_prc = ov::element::f32) + jit_abs_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32) : jit_dnnl_emitter(host, host_isa, n, exec_prc) { - kind = dnnl_eltwise_abs; - alpha = 0.f; - beta = 0.f; + kind = dnnl_eltwise_abs; + alpha = 0.f; + beta = 0.f; - set_injector(); - } + set_injector(); + } }; class jit_clamp_emitter : public jit_dnnl_emitter { public: - jit_clamp_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - ov::element::Type exec_prc = ov::element::f32) + jit_clamp_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32) : jit_dnnl_emitter(host, host_isa, n, exec_prc) { - kind = dnnl_eltwise_clip; - auto op = ov::as_type_ptr(n); - alpha = op->get_min(); - beta = op->get_max(); + kind = dnnl_eltwise_clip; + auto op = ov::as_type_ptr(n); + alpha = op->get_min(); + beta = op->get_max(); - set_injector(); - } + set_injector(); + } }; class jit_swish_emitter : public jit_dnnl_emitter { public: - jit_swish_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t 
host_isa, const std::shared_ptr& n, - ov::element::Type exec_prc = ov::element::f32) - : jit_dnnl_emitter(host, host_isa, n, exec_prc) { + jit_swish_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32) + : jit_dnnl_emitter(host, host_isa, n, exec_prc) { kind = dnnl_eltwise_swish; auto op = ov::as_type_ptr(n); alpha = op->get_alpha(); @@ -107,9 +121,11 @@ class jit_swish_emitter : public jit_dnnl_emitter { class jit_hswish_emitter : public jit_dnnl_emitter { public: - jit_hswish_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - ov::element::Type exec_prc = ov::element::f32) - : jit_dnnl_emitter(host, host_isa, n, exec_prc) { + jit_hswish_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32) + : jit_dnnl_emitter(host, host_isa, n, exec_prc) { // since v3.0 oneDNN has flexible version of hardswish, ov still uses the one with hardcoded alpha and beta kind = dnnl_eltwise_hardswish; alpha = 1.f / 6.f; @@ -121,9 +137,11 @@ class jit_hswish_emitter : public jit_dnnl_emitter { class jit_gelu_v0_emitter : public jit_dnnl_emitter { public: - jit_gelu_v0_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_gelu_v0_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32) - : jit_dnnl_emitter(host, host_isa, n, exec_prc) { + : jit_dnnl_emitter(host, host_isa, n, exec_prc) { kind = dnnl_eltwise_gelu_erf; set_injector(); @@ -132,9 +150,11 @@ class jit_gelu_v0_emitter : public jit_dnnl_emitter { class jit_gelu_v7_emitter : public jit_dnnl_emitter { public: - 
jit_gelu_v7_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_gelu_v7_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32) - : jit_dnnl_emitter(host, host_isa, n, exec_prc) { + : jit_dnnl_emitter(host, host_isa, n, exec_prc) { auto gelu = getNgraphOpAs(n); ov::op::GeluApproximationMode approximationMode = gelu->get_approximation_mode(); if (approximationMode == ov::op::GeluApproximationMode::ERF) @@ -152,11 +172,11 @@ class jit_gelu_v7_emitter : public jit_dnnl_emitter { class jit_round_emitter : public jit_dnnl_emitter { public: - jit_round_emitter( - dnnl::impl::cpu::x64::jit_generator *host, - dnnl::impl::cpu::x64::cpu_isa_t host_isa, - const std::shared_ptr& n, - ov::element::Type exec_prc = ov::element::f32) : jit_dnnl_emitter(host, host_isa, n, exec_prc) { + jit_round_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32) + : jit_dnnl_emitter(host, host_isa, n, exec_prc) { const auto round = getNgraphOpAs(n); const auto mode = round->get_mode(); if ((mode != ov::opset5::Round::RoundMode::HALF_AWAY_FROM_ZERO) && @@ -165,12 +185,11 @@ class jit_round_emitter : public jit_dnnl_emitter { static_cast(mode)); } - kind = mode == ov::opset5::Round::RoundMode::HALF_AWAY_FROM_ZERO ? - dnnl_eltwise_round_half_away_from_zero : - dnnl_eltwise_round_half_to_even; + kind = mode == ov::opset5::Round::RoundMode::HALF_AWAY_FROM_ZERO ? 
dnnl_eltwise_round_half_away_from_zero + : dnnl_eltwise_round_half_to_even; set_injector(); } }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp index 0331a3ee4908b9..7a091fc946c2d8 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp @@ -8,8 +8,8 @@ using namespace dnnl::impl::utils; using namespace dnnl::impl::cpu; using namespace Xbyak; -#define CONST_1_F 0x3f800000 // 1.f -#define INF_MASK 0x7F800000 +#define CONST_1_F 0x3f800000 // 1.f +#define INF_MASK 0x7F800000 #define INF_NEG_MASK 0xFF800000 namespace ov { @@ -22,23 +22,30 @@ ov::element::Type get_arithmetic_binary_exec_precision(const std::shared_ptr& node) -: jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) {} -jit_add_emitter::jit_add_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} +jit_add_emitter::jit_add_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) {} +jit_add_emitter::jit_add_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_add_emitter::get_inputs_num() const { return 2; } +size_t jit_add_emitter::get_inputs_num() const { + return 2; +} -void jit_add_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_add_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -51,7 +58,7 @@ void 
jit_add_emitter::emit_impl(const std::vector &in_vec_idxs, const st } template -void jit_add_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_add_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -59,9 +66,14 @@ void jit_add_emitter::emit_isa(const std::vector &in_vec_idxs, const std auto uni_vadd = [this](Vmm vmm_dst, Vmm vmm_src0, Vmm vmm_src1) { switch (exec_prc_) { - case ov::element::f32: h->uni_vaddps(vmm_dst, vmm_src0, vmm_src1); break; - case ov::element::i32: h->uni_vpaddd(vmm_dst, vmm_src0, vmm_src1); break; - default: OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); + case ov::element::f32: + h->uni_vaddps(vmm_dst, vmm_src0, vmm_src1); + break; + case ov::element::i32: + h->uni_vpaddd(vmm_dst, vmm_src0, vmm_src1); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); } }; @@ -78,14 +90,19 @@ std::set> jit_add_emitter::get_supported_precisions(c } /// MUL_ADD /// -jit_mul_add_emitter::jit_mul_add_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node) -: jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) {} -jit_mul_add_emitter::jit_mul_add_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} +jit_mul_add_emitter::jit_mul_add_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) {} +jit_mul_add_emitter::jit_mul_add_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_mul_add_emitter::get_inputs_num() const { return 3; } +size_t jit_mul_add_emitter::get_inputs_num() const { + return 3; +} -void 
jit_mul_add_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_mul_add_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -98,7 +115,8 @@ void jit_mul_add_emitter::emit_impl(const std::vector &in_vec_idxs, cons } template -void jit_mul_add_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_mul_add_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -109,47 +127,49 @@ void jit_mul_add_emitter::emit_isa(const std::vector &in_vec_idxs, const auto uni_vfmadd231_xmm = [this](Xmm vmm_dst, Xmm vmm_src0, Xmm vmm_src1, Xmm vmm_src2) { h->uni_vmovups(vmm_dst, vmm_src0); switch (exec_prc_) { - case ov::element::f32: { - h->uni_vmulps(vmm_dst, vmm_dst, vmm_src1); - h->uni_vaddps(vmm_dst, vmm_dst, vmm_src2); - } break; - case ov::element::i32: { - h->uni_vpmulld(vmm_dst, vmm_dst, vmm_src1); - h->uni_vpaddd(vmm_dst, vmm_dst, vmm_src2); - } break; - default: OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); + case ov::element::f32: { + h->uni_vmulps(vmm_dst, vmm_dst, vmm_src1); + h->uni_vaddps(vmm_dst, vmm_dst, vmm_src2); + } break; + case ov::element::i32: { + h->uni_vpmulld(vmm_dst, vmm_dst, vmm_src1); + h->uni_vpaddd(vmm_dst, vmm_dst, vmm_src2); + } break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); } }; auto uni_vfmadd231_vmm = [this, vmm_aux0](Vmm vmm_dst, Vmm vmm_src0, Vmm vmm_src1, Vmm vmm_src2) { switch (exec_prc_) { - case ov::element::f32: { - Vmm vmm_mul0; - if (vmm_dst.getIdx() == vmm_src0.getIdx()) { - h->uni_vmovups(vmm_aux0, vmm_src0); - vmm_mul0 = vmm_aux0; - } else { - vmm_mul0 = vmm_src0; - } - - Vmm vmm_mul1; - if (vmm_dst.getIdx() == 
vmm_src1.getIdx()) { - h->uni_vmovups(vmm_aux0, vmm_src1); - vmm_mul1 = vmm_aux0; - } else { - vmm_mul1 = vmm_src1; - } - - if (vmm_dst.getIdx() != vmm_src2.getIdx()) - h->uni_vmovups(vmm_dst, vmm_src2); - - h->uni_vfmadd231ps(vmm_dst, vmm_mul0, vmm_mul1); - } break; - case ov::element::i32: { - h->uni_vpmulld(vmm_dst, vmm_src0, vmm_src1); - h->uni_vpaddd(vmm_dst, vmm_dst, vmm_src2); - } break; - default: OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); + case ov::element::f32: { + Vmm vmm_mul0; + if (vmm_dst.getIdx() == vmm_src0.getIdx()) { + h->uni_vmovups(vmm_aux0, vmm_src0); + vmm_mul0 = vmm_aux0; + } else { + vmm_mul0 = vmm_src0; + } + + Vmm vmm_mul1; + if (vmm_dst.getIdx() == vmm_src1.getIdx()) { + h->uni_vmovups(vmm_aux0, vmm_src1); + vmm_mul1 = vmm_aux0; + } else { + vmm_mul1 = vmm_src1; + } + + if (vmm_dst.getIdx() != vmm_src2.getIdx()) + h->uni_vmovups(vmm_dst, vmm_src2); + + h->uni_vfmadd231ps(vmm_dst, vmm_mul0, vmm_mul1); + } break; + case ov::element::i32: { + h->uni_vpmulld(vmm_dst, vmm_src0, vmm_src1); + h->uni_vpaddd(vmm_dst, vmm_dst, vmm_src2); + } break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); } }; @@ -164,19 +184,27 @@ size_t jit_mul_add_emitter::aux_vecs_count() const { return 1; } -std::set> jit_mul_add_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_mul_add_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32, element::f32}, {element::i32, element::i32, element::i32}}; } /// SUB /// -jit_subtract_emitter::jit_subtract_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node) -: jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) {} -jit_subtract_emitter::jit_subtract_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} +jit_subtract_emitter::jit_subtract_emitter(x64::jit_generator* host, + x64::cpu_isa_t 
host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) {} +jit_subtract_emitter::jit_subtract_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_subtract_emitter::get_inputs_num() const { return 2; } +size_t jit_subtract_emitter::get_inputs_num() const { + return 2; +} -void jit_subtract_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_subtract_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -189,7 +217,8 @@ void jit_subtract_emitter::emit_impl(const std::vector &in_vec_idxs, con } template -void jit_subtract_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_subtract_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -197,9 +226,14 @@ void jit_subtract_emitter::emit_isa(const std::vector &in_vec_idxs, cons auto uni_vsub = [this](Vmm vmm_dst, Vmm vmm_src0, Vmm vmm_src1) { switch (exec_prc_) { - case ov::element::f32: h->uni_vsubps(vmm_dst, vmm_src0, vmm_src1); break; - case ov::element::i32: h->uni_vpsubd(vmm_dst, vmm_src0, vmm_src1); break; - default: OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); + case ov::element::f32: + h->uni_vsubps(vmm_dst, vmm_src0, vmm_src1); + break; + case ov::element::i32: + h->uni_vpsubd(vmm_dst, vmm_src0, vmm_src1); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); } }; @@ -211,19 +245,27 @@ void jit_subtract_emitter::emit_isa(const std::vector &in_vec_idxs, cons } } -std::set> jit_subtract_emitter::get_supported_precisions(const 
std::shared_ptr& node) { +std::set> jit_subtract_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}, {element::i32, element::i32}}; } /// MULTIPLY /// -jit_multiply_emitter::jit_multiply_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node) -: jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) {} -jit_multiply_emitter::jit_multiply_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} +jit_multiply_emitter::jit_multiply_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) {} +jit_multiply_emitter::jit_multiply_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_multiply_emitter::get_inputs_num() const { return 2; } +size_t jit_multiply_emitter::get_inputs_num() const { + return 2; +} -void jit_multiply_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_multiply_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -236,7 +278,8 @@ void jit_multiply_emitter::emit_impl(const std::vector &in_vec_idxs, con } template -void jit_multiply_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_multiply_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -244,9 +287,14 @@ void jit_multiply_emitter::emit_isa(const std::vector &in_vec_idxs, cons auto uni_vmul = [this](Vmm vmm_dst, Vmm 
vmm_src0, Vmm vmm_src1) { switch (exec_prc_) { - case ov::element::f32: h->uni_vmulps(vmm_dst, vmm_src0, vmm_src1); break; - case ov::element::i32: h->uni_vpmulld(vmm_dst, vmm_src0, vmm_src1); break; - default: OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); + case ov::element::f32: + h->uni_vmulps(vmm_dst, vmm_src0, vmm_src1); + break; + case ov::element::i32: + h->uni_vpmulld(vmm_dst, vmm_src0, vmm_src1); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); } }; @@ -258,19 +306,26 @@ void jit_multiply_emitter::emit_isa(const std::vector &in_vec_idxs, cons } } -std::set> jit_multiply_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_multiply_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}, {element::i32, element::i32}}; } /// DIVIDE /// -jit_divide_emitter::jit_divide_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) {} -jit_divide_emitter::jit_divide_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} +jit_divide_emitter::jit_divide_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + const std::shared_ptr& node, + ov::element::Type exec_prc) + : jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) {} +jit_divide_emitter::jit_divide_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_divide_emitter::get_inputs_num() const { return 2; } +size_t jit_divide_emitter::get_inputs_num() const { + return 2; +} -void jit_divide_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_divide_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if 
(host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -283,7 +338,8 @@ void jit_divide_emitter::emit_impl(const std::vector &in_vec_idxs, const } template -void jit_divide_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_divide_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -291,23 +347,24 @@ void jit_divide_emitter::emit_isa(const std::vector &in_vec_idxs, const auto uni_vdiv = [this](Vmm vmm_dst, Vmm vmm_src0, Vmm vmm_src1) { switch (exec_prc_) { - case ov::element::f32: { - h->uni_vdivps(vmm_dst, vmm_src0, vmm_src1); - break; - } - case ov::element::i32: { - Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); - - // The opset doesn't contain vector instruction for integer divide operation - // As WA we emulate its behavior via fp divide followed by rounding to zero - h->uni_vcvtdq2ps(vmm_dst, vmm_src0); - h->uni_vcvtdq2ps(vmm_aux0, vmm_src1); - h->uni_vdivps(vmm_dst, vmm_dst, vmm_aux0); - h->uni_vroundps(vmm_dst, vmm_dst, 3); // rounding to zero - h->uni_vcvtps2dq(vmm_dst, vmm_dst); - break; - } - default: OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); + case ov::element::f32: { + h->uni_vdivps(vmm_dst, vmm_src0, vmm_src1); + break; + } + case ov::element::i32: { + Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]); + + // The opset doesn't contain vector instruction for integer divide operation + // As WA we emulate its behavior via fp divide followed by rounding to zero + h->uni_vcvtdq2ps(vmm_dst, vmm_src0); + h->uni_vcvtdq2ps(vmm_aux0, vmm_src1); + h->uni_vdivps(vmm_dst, vmm_dst, vmm_aux0); + h->uni_vroundps(vmm_dst, vmm_dst, 3); // rounding to zero + h->uni_vcvtps2dq(vmm_dst, vmm_dst); + break; + } + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); } }; @@ -319,7 +376,8 @@ void 
jit_divide_emitter::emit_isa(const std::vector &in_vec_idxs, const } } -std::set> jit_divide_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_divide_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}, {element::i32, element::i32}}; } @@ -328,18 +386,25 @@ size_t jit_divide_emitter::aux_vecs_count() const { } /// FLOOR /// -jit_floor_emitter::jit_floor_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} -jit_floor_emitter::jit_floor_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} +jit_floor_emitter::jit_floor_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + const std::shared_ptr& node, + ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} +jit_floor_emitter::jit_floor_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_floor_emitter::get_inputs_num() const { return 1; } +size_t jit_floor_emitter::get_inputs_num() const { + return 1; +} -std::set> jit_floor_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_floor_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } -void jit_floor_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_floor_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -352,7 +417,8 @@ void jit_floor_emitter::emit_impl(const std::vector& in_vec_idxs, const } template -void jit_floor_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void 
jit_floor_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src = Vmm(in_vec_idxs[0]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); @@ -360,14 +426,20 @@ void jit_floor_emitter::emit_isa(const std::vector &in_vec_idxs, const s } /// CEILING /// -jit_ceiling_emitter::jit_ceiling_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} -jit_ceiling_emitter::jit_ceiling_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) +jit_ceiling_emitter::jit_ceiling_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + const std::shared_ptr& node, + ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} +jit_ceiling_emitter::jit_ceiling_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_ceiling_emitter::get_inputs_num() const { return 1; } +size_t jit_ceiling_emitter::get_inputs_num() const { + return 1; +} -std::set> jit_ceiling_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_ceiling_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } @@ -385,7 +457,8 @@ void jit_ceiling_emitter::emit_impl(const std::vector& in_vec_idxs, } template -void jit_ceiling_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_ceiling_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src = Vmm(in_vec_idxs[0]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); @@ -403,13 +476,17 @@ jit_floor_mod_emitter::jit_floor_mod_emitter(x64::jit_generator* host, ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_floor_mod_emitter::get_inputs_num() const { 
return 2; } +size_t jit_floor_mod_emitter::get_inputs_num() const { + return 2; +} -std::set> jit_floor_mod_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_floor_mod_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } -void jit_floor_mod_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_floor_mod_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -422,7 +499,8 @@ void jit_floor_mod_emitter::emit_impl(const std::vector& in_vec_idxs, co } template -void jit_floor_mod_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_floor_mod_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -434,14 +512,14 @@ void jit_floor_mod_emitter::emit_isa(const std::vector &in_vec_idxs, con h->uni_vmovups(vmm_dst, vmm_src0); h->uni_vmovups(vmm_aux0, vmm_src0); h->uni_vdivps(vmm_aux0, vmm_aux0, vmm_src1); - h->uni_vroundps(vmm_aux0, vmm_aux0, 1); // rounding down + h->uni_vroundps(vmm_aux0, vmm_aux0, 1); // rounding down h->uni_vmulps(vmm_aux0, vmm_aux0, vmm_src1); h->uni_vsubps(vmm_dst, vmm_dst, vmm_aux0); } else { if (vmm_dst.getIdx() != vmm_src0.getIdx()) h->uni_vmovups(vmm_dst, vmm_src0); h->uni_vdivps(vmm_aux0, vmm_src0, vmm_src1); - h->uni_vroundps(vmm_aux0, vmm_aux0, 1); // rounding down + h->uni_vroundps(vmm_aux0, vmm_aux0, 1); // rounding down h->uni_vmulps(vmm_aux0, vmm_aux0, vmm_src1); h->uni_vsubps(vmm_dst, vmm_dst, vmm_aux0); } @@ -452,12 +530,17 @@ size_t jit_floor_mod_emitter::aux_vecs_count() const { } /// MOD /// -jit_mod_emitter::jit_mod_emitter(x64::jit_generator *host, 
x64::cpu_isa_t host_isa, const std::shared_ptr& node, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} -jit_mod_emitter::jit_mod_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} +jit_mod_emitter::jit_mod_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + const std::shared_ptr& node, + ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} +jit_mod_emitter::jit_mod_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_mod_emitter::get_inputs_num() const { return 2; } +size_t jit_mod_emitter::get_inputs_num() const { + return 2; +} std::set> jit_mod_emitter::get_supported_precisions(const std::shared_ptr& node) { return {{element::f32, element::f32}}; @@ -476,7 +559,7 @@ void jit_mod_emitter::emit_impl(const std::vector& in_vec_idxs, const st } template -void jit_mod_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_mod_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -488,14 +571,14 @@ void jit_mod_emitter::emit_isa(const std::vector &in_vec_idxs, const std h->uni_vmovups(vmm_dst, vmm_src0); h->uni_vmovups(vmm_aux0, vmm_src0); h->uni_vdivps(vmm_aux0, vmm_aux0, vmm_src1); - h->uni_vroundps(vmm_aux0, vmm_aux0, 3); // truncate + h->uni_vroundps(vmm_aux0, vmm_aux0, 3); // truncate h->uni_vmulps(vmm_aux0, vmm_aux0, vmm_src1); h->uni_vsubps(vmm_dst, vmm_dst, vmm_aux0); } else { if (vmm_dst.getIdx() != vmm_src0.getIdx()) h->uni_vmovups(vmm_dst, vmm_src0); h->uni_vdivps(vmm_aux0, vmm_src0, vmm_src1); - h->uni_vroundps(vmm_aux0, vmm_aux0, 3); // truncate + h->uni_vroundps(vmm_aux0, vmm_aux0, 3); // truncate h->uni_vmulps(vmm_aux0, vmm_aux0, 
vmm_src1); h->uni_vsubps(vmm_dst, vmm_dst, vmm_aux0); } @@ -506,14 +589,19 @@ size_t jit_mod_emitter::aux_vecs_count() const { } /// MAXIMUM /// -jit_maximum_emitter::jit_maximum_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node) -: jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) {} -jit_maximum_emitter::jit_maximum_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} +jit_maximum_emitter::jit_maximum_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) {} +jit_maximum_emitter::jit_maximum_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_maximum_emitter::get_inputs_num() const { return 2; } +size_t jit_maximum_emitter::get_inputs_num() const { + return 2; +} -void jit_maximum_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_maximum_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -526,7 +614,8 @@ void jit_maximum_emitter::emit_impl(const std::vector &in_vec_idxs, cons } template -void jit_maximum_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_maximum_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -534,9 +623,14 @@ void jit_maximum_emitter::emit_isa(const std::vector &in_vec_idxs, const auto uni_vmax = [this](Vmm vmm_dst, Vmm vmm_src0, Vmm vmm_src1) { switch (exec_prc_) { - case ov::element::f32: 
h->uni_vmaxps(vmm_dst, vmm_src0, vmm_src1); break; - case ov::element::i32: h->uni_vpmaxsd(vmm_dst, vmm_src0, vmm_src1); break; - default: OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); + case ov::element::f32: + h->uni_vmaxps(vmm_dst, vmm_src0, vmm_src1); + break; + case ov::element::i32: + h->uni_vpmaxsd(vmm_dst, vmm_src0, vmm_src1); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); } }; @@ -549,19 +643,25 @@ void jit_maximum_emitter::emit_isa(const std::vector &in_vec_idxs, const } } -std::set> jit_maximum_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_maximum_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}, {element::i32, element::i32}}; } /// MINIMUM /// -jit_minimum_emitter::jit_minimum_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node) -: jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) {} -jit_minimum_emitter::jit_minimum_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} +jit_minimum_emitter::jit_minimum_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, get_arithmetic_binary_exec_precision(node)) {} +jit_minimum_emitter::jit_minimum_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_minimum_emitter::get_inputs_num() const { return 2; } +size_t jit_minimum_emitter::get_inputs_num() const { + return 2; +} -void jit_minimum_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_minimum_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -574,7 +674,8 
@@ void jit_minimum_emitter::emit_impl(const std::vector &in_vec_idxs, cons } template -void jit_minimum_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_minimum_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -582,9 +683,14 @@ void jit_minimum_emitter::emit_isa(const std::vector &in_vec_idxs, const auto uni_vmin = [this](Vmm vmm_dst, Vmm vmm_src0, Vmm vmm_src1) { switch (exec_prc_) { - case ov::element::f32: h->uni_vminps(vmm_dst, vmm_src0, vmm_src1); break; - case ov::element::i32: h->uni_vpminsd(vmm_dst, vmm_src0, vmm_src1); break; - default: OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); + case ov::element::f32: + h->uni_vminps(vmm_dst, vmm_src0, vmm_src1); + break; + case ov::element::i32: + h->uni_vpminsd(vmm_dst, vmm_src0, vmm_src1); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); } }; @@ -597,20 +703,28 @@ void jit_minimum_emitter::emit_isa(const std::vector &in_vec_idxs, const } } -std::set> jit_minimum_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_minimum_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}, {element::i32, element::i32}}; } /// SQUARED_DIFFERENCE /// -jit_squared_difference_emitter::jit_squared_difference_emitter( - x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} -jit_squared_difference_emitter::jit_squared_difference_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) {} +jit_squared_difference_emitter::jit_squared_difference_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + const std::shared_ptr& node, + ov::element::Type 
exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} +jit_squared_difference_emitter::jit_squared_difference_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_squared_difference_emitter::get_inputs_num() const { return 2; } +size_t jit_squared_difference_emitter::get_inputs_num() const { + return 2; +} -void jit_squared_difference_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_squared_difference_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -623,7 +737,8 @@ void jit_squared_difference_emitter::emit_impl(const std::vector &in_vec } template -void jit_squared_difference_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_squared_difference_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -631,15 +746,16 @@ void jit_squared_difference_emitter::emit_isa(const std::vector &in_vec_ auto uni_vsqdiff = [this](Vmm vmm_dst, Vmm vmm_src0, Vmm vmm_src1) { switch (exec_prc_) { - case ov::element::f32: { - h->uni_vsubps(vmm_dst, vmm_src0, vmm_src1); - h->uni_vmulps(vmm_dst, vmm_dst, vmm_dst); - } break; - case ov::element::i32: { - h->uni_vpsubd(vmm_dst, vmm_src0, vmm_src1); - h->uni_vpmulld(vmm_dst, vmm_dst, vmm_dst); - } break; - default: OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); + case ov::element::f32: { + h->uni_vsubps(vmm_dst, vmm_src0, vmm_src1); + h->uni_vmulps(vmm_dst, vmm_dst, vmm_dst); + } break; + case ov::element::i32: { + h->uni_vpsubd(vmm_dst, vmm_src0, vmm_src1); + h->uni_vpmulld(vmm_dst, vmm_dst, vmm_dst); + } break; + default: + 
OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); } }; @@ -652,24 +768,33 @@ void jit_squared_difference_emitter::emit_isa(const std::vector &in_vec_ } } -std::set> jit_squared_difference_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_squared_difference_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}, {element::i32, element::i32}}; } /// POWER_DYNAMIC /// -jit_power_dynamic_emitter::jit_power_dynamic_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, +jit_power_dynamic_emitter::jit_power_dynamic_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + const std::shared_ptr& node, ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) {} -jit_power_dynamic_emitter::jit_power_dynamic_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) +jit_power_dynamic_emitter::jit_power_dynamic_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_power_dynamic_emitter::get_inputs_num() const { return 2; } +size_t jit_power_dynamic_emitter::get_inputs_num() const { + return 2; +} -std::set> jit_power_dynamic_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_power_dynamic_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } -void jit_power_dynamic_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_power_dynamic_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -682,7 +807,8 @@ void jit_power_dynamic_emitter::emit_impl(const std::vector& in_vec_idxs } template -void jit_power_dynamic_emitter::emit_isa(const std::vector 
&in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_power_dynamic_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -692,8 +818,8 @@ void jit_power_dynamic_emitter::emit_isa(const std::vector &in_vec_idxs, // caller obligation to save gprs as callee may use them size_t gpr_size = 8; - Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->rax, - h->rcx, h->rdx, h->rdi, h->rsi, h->rbp, h->rbx}; + Xbyak::Operand gprs_to_save[] = + {h->r8, h->r9, h->r10, h->r11, h->rax, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp, h->rbx}; size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]); h->sub(h->rsp, n_gprs_to_save * gpr_size); @@ -722,8 +848,8 @@ void jit_power_dynamic_emitter::emit_isa(const std::vector &in_vec_idxs, h->sub(h->rsp, (get_max_vecs_count() + 2) * get_vec_length()); for (size_t i = 2; i < get_max_vecs_count() + 2; ++i) h->uni_vmovups(h->ptr[h->rsp + i * get_vec_length()], Vmm(i - 2)); - h->uni_vmovups(h->ptr[h->rsp + 0 * get_vec_length()], vmm_src0); // src - h->uni_vmovups(h->ptr[h->rsp + 1 * get_vec_length()], vmm_src1); // beta + h->uni_vmovups(h->ptr[h->rsp + 0 * get_vec_length()], vmm_src0); // src + h->uni_vmovups(h->ptr[h->rsp + 1 * get_vec_length()], vmm_src1); // beta // save function address in gpr to pass in in call instruction h->mov(h->rbp, reinterpret_cast(powf)); @@ -735,7 +861,7 @@ void jit_power_dynamic_emitter::emit_isa(const std::vector &in_vec_idxs, // Take src, apply powf on it and replace value on a stack with dst. 
for (size_t i = 0; i < get_vec_length() / sizeof(float); ++i) { - const Address &source = h->ptr[h->rsp + h->rbx + i * sizeof(float)]; + const Address& source = h->ptr[h->rsp + h->rbx + i * sizeof(float)]; h->uni_vmovss(xmm0, source); h->uni_vmovss(xmm1, h->ptr[h->rsp + h->rbx + get_vec_length() + i * sizeof(float)]); h->call(h->rbp); @@ -767,24 +893,30 @@ void jit_power_dynamic_emitter::emit_isa(const std::vector &in_vec_idxs, h->add(h->rsp, n_gprs_to_save * gpr_size); } - /// EQUAL /// -jit_equal_emitter::jit_equal_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) { +jit_equal_emitter::jit_equal_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + const std::shared_ptr& node, + ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -jit_equal_emitter::jit_equal_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) { +jit_equal_emitter::jit_equal_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -size_t jit_equal_emitter::get_inputs_num() const { return 2; } +size_t jit_equal_emitter::get_inputs_num() const { + return 2; +} -std::set> jit_equal_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_equal_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } -void jit_equal_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_equal_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -797,7 +929,8 @@ void jit_equal_emitter::emit_impl(const std::vector& in_vec_idxs, const 
} template -void jit_equal_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_equal_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -846,13 +979,17 @@ jit_not_equal_emitter::jit_not_equal_emitter(x64::jit_generator* host, prepare_table(); } -size_t jit_not_equal_emitter::get_inputs_num() const { return 2; } +size_t jit_not_equal_emitter::get_inputs_num() const { + return 2; +} -std::set> jit_not_equal_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_not_equal_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } -void jit_not_equal_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_not_equal_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -865,7 +1002,8 @@ void jit_not_equal_emitter::emit_impl(const std::vector& in_vec_idxs, co } template -void jit_not_equal_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_not_equal_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -900,22 +1038,29 @@ size_t jit_not_equal_emitter::aux_vecs_count() const { } /// GREATER /// -jit_greater_emitter::jit_greater_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) { +jit_greater_emitter::jit_greater_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + const std::shared_ptr& 
node, + ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -jit_greater_emitter::jit_greater_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) { +jit_greater_emitter::jit_greater_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -size_t jit_greater_emitter::get_inputs_num() const { return 2; } +size_t jit_greater_emitter::get_inputs_num() const { + return 2; +} -std::set> jit_greater_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_greater_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } -void jit_greater_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_greater_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -928,7 +1073,8 @@ void jit_greater_emitter::emit_impl(const std::vector& in_vec_idxs, cons } template -void jit_greater_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_greater_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -963,23 +1109,31 @@ size_t jit_greater_emitter::aux_vecs_count() const { } /// GREATER_EQUAL /// -jit_greater_equal_emitter::jit_greater_equal_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, +jit_greater_equal_emitter::jit_greater_equal_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + const std::shared_ptr& node, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, 
exec_prc) { + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -jit_greater_equal_emitter::jit_greater_equal_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) { +jit_greater_equal_emitter::jit_greater_equal_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -size_t jit_greater_equal_emitter::get_inputs_num() const { return 2; } +size_t jit_greater_equal_emitter::get_inputs_num() const { + return 2; +} -std::set> jit_greater_equal_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_greater_equal_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } -void jit_greater_equal_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_greater_equal_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -992,7 +1146,8 @@ void jit_greater_equal_emitter::emit_impl(const std::vector& in_vec_idxs } template -void jit_greater_equal_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_greater_equal_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -1027,22 +1182,28 @@ size_t jit_greater_equal_emitter::aux_vecs_count() const { } /// LESS /// -jit_less_emitter::jit_less_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) { +jit_less_emitter::jit_less_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + 
const std::shared_ptr& node, + ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -jit_less_emitter::jit_less_emitter(x64::jit_generator *host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc) { +jit_less_emitter::jit_less_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -size_t jit_less_emitter::get_inputs_num() const { return 2; } +size_t jit_less_emitter::get_inputs_num() const { + return 2; +} std::set> jit_less_emitter::get_supported_precisions(const std::shared_ptr& node) { return {{element::f32, element::f32}}; } -void jit_less_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_less_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -1055,7 +1216,7 @@ void jit_less_emitter::emit_impl(const std::vector& in_vec_idxs, const s } template -void jit_less_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_less_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -1104,13 +1265,17 @@ jit_less_equal_emitter::jit_less_equal_emitter(x64::jit_generator* host, prepare_table(); } -size_t jit_less_equal_emitter::get_inputs_num() const { return 2; } +size_t jit_less_equal_emitter::get_inputs_num() const { + return 2; +} -std::set> jit_less_equal_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_less_equal_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } -void jit_less_equal_emitter::emit_impl(const 
std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_less_equal_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -1123,7 +1288,8 @@ void jit_less_equal_emitter::emit_impl(const std::vector& in_vec_idxs, c } template -void jit_less_equal_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_less_equal_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -1173,13 +1339,17 @@ jit_logical_and_emitter::jit_logical_and_emitter(x64::jit_generator* host, prepare_table(); } -size_t jit_logical_and_emitter::get_inputs_num() const { return 2; } +size_t jit_logical_and_emitter::get_inputs_num() const { + return 2; +} -std::set> jit_logical_and_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_logical_and_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } -void jit_logical_and_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_logical_and_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -1192,7 +1362,8 @@ void jit_logical_and_emitter::emit_impl(const std::vector& in_vec_idxs, } template -void jit_logical_and_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_logical_and_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = 
Vmm(in_vec_idxs[1]); @@ -1261,13 +1432,17 @@ jit_logical_or_emitter::jit_logical_or_emitter(x64::jit_generator* host, prepare_table(); } -size_t jit_logical_or_emitter::get_inputs_num() const { return 2; } +size_t jit_logical_or_emitter::get_inputs_num() const { + return 2; +} -std::set> jit_logical_or_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_logical_or_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } -void jit_logical_or_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_logical_or_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -1280,7 +1455,8 @@ void jit_logical_or_emitter::emit_impl(const std::vector& in_vec_idxs, c } template -void jit_logical_or_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_logical_or_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -1349,13 +1525,17 @@ jit_logical_xor_emitter::jit_logical_xor_emitter(x64::jit_generator* host, prepare_table(); } -size_t jit_logical_xor_emitter::get_inputs_num() const { return 2; } +size_t jit_logical_xor_emitter::get_inputs_num() const { + return 2; +} -std::set> jit_logical_xor_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_logical_xor_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } -void jit_logical_xor_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_logical_xor_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const 
{ if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -1368,7 +1548,8 @@ void jit_logical_xor_emitter::emit_impl(const std::vector& in_vec_idxs, } template -void jit_logical_xor_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_logical_xor_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -1437,13 +1618,17 @@ jit_logical_not_emitter::jit_logical_not_emitter(x64::jit_generator* host, prepare_table(); } -size_t jit_logical_not_emitter::get_inputs_num() const { return 1; } +size_t jit_logical_not_emitter::get_inputs_num() const { + return 1; +} -std::set> jit_logical_not_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_logical_not_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } -void jit_logical_not_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_logical_not_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -1456,7 +1641,8 @@ void jit_logical_not_emitter::emit_impl(const std::vector& in_vec_idxs, } template -void jit_logical_not_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_logical_not_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); @@ -1507,20 +1693,30 @@ jit_power_static_emitter::jit_power_static_emitter(x64::jit_generator* host, prepare_table(); } -jit_power_static_emitter::jit_power_static_emitter(x64::jit_generator 
*host, x64::cpu_isa_t host_isa, - float inpPower, float inpScale, float inpShift, +jit_power_static_emitter::jit_power_static_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + float inpPower, + float inpScale, + float inpShift, ov::element::Type exec_prc) -: jit_emitter(host, host_isa, exec_prc), power(inpPower), scale(inpScale), shift(inpShift) { + : jit_emitter(host, host_isa, exec_prc), + power(inpPower), + scale(inpScale), + shift(inpShift) { prepare_table(); } -size_t jit_power_static_emitter::get_inputs_num() const { return 1; } +size_t jit_power_static_emitter::get_inputs_num() const { + return 1; +} -std::set> jit_power_static_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_power_static_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } -void jit_power_static_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_power_static_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -1533,7 +1729,8 @@ void jit_power_static_emitter::emit_impl(const std::vector& in_vec_idxs, } template -void jit_power_static_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_power_static_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); @@ -1600,8 +1797,8 @@ void jit_power_static_emitter::emit_isa(const std::vector &in_vec_idxs, // caller obligation to save gprs as callee may use them size_t gpr_size = 8; - Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->rax, - h->rcx, h->rdx, h->rdi, h->rsi, h->rbp, h->rbx}; + Xbyak::Operand gprs_to_save[] = + {h->r8, h->r9, h->r10, h->r11, 
h->rax, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp, h->rbx}; size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]); h->sub(h->rsp, n_gprs_to_save * gpr_size); @@ -1630,8 +1827,8 @@ void jit_power_static_emitter::emit_isa(const std::vector &in_vec_idxs, h->sub(h->rsp, (get_max_vecs_count() + 2) * get_vec_length()); for (size_t i = 2; i < get_max_vecs_count() + 2; ++i) h->uni_vmovups(h->ptr[h->rsp + i * get_vec_length()], Vmm(i - 2)); - h->uni_vmovups(h->ptr[h->rsp + 0 * get_vec_length()], vmm_dst); // src - h->uni_vmovups(h->ptr[h->rsp + 1 * get_vec_length()], vmm_aux0); // beta + h->uni_vmovups(h->ptr[h->rsp + 0 * get_vec_length()], vmm_dst); // src + h->uni_vmovups(h->ptr[h->rsp + 1 * get_vec_length()], vmm_aux0); // beta // save function address in gpr to pass in in call instruction h->mov(h->rbp, reinterpret_cast(powf)); @@ -1643,7 +1840,7 @@ void jit_power_static_emitter::emit_isa(const std::vector &in_vec_idxs, // Take src, apply powf on it and replace value on a stack with dst. 
for (size_t i = 0; i < get_vec_length() / sizeof(float); ++i) { - const Address &source = h->ptr[h->rsp + h->rbx + i * sizeof(float)]; + const Address& source = h->ptr[h->rsp + h->rbx + i * sizeof(float)]; h->uni_vmovss(xmm0, source); h->uni_vmovss(xmm1, h->ptr[h->rsp + h->rbx + get_vec_length() + i * sizeof(float)]); h->call(h->rbp); @@ -1680,7 +1877,7 @@ void jit_power_static_emitter::register_table_entries() { push_arg_entry_of("power", x64::float2int(power), true); push_arg_entry_of("scale", x64::float2int(scale), true); push_arg_entry_of("shift", x64::float2int(shift), true); - push_arg_entry_of("one", x64::float2int(1.f), true); + push_arg_entry_of("one", x64::float2int(1.f), true); } size_t jit_power_static_emitter::aux_vecs_count() const { @@ -1699,13 +1896,17 @@ jit_prelu_emitter::jit_prelu_emitter(x64::jit_generator* host, x64::cpu_isa_t ho : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -size_t jit_prelu_emitter::get_inputs_num() const { return 2; } +size_t jit_prelu_emitter::get_inputs_num() const { + return 2; +} -std::set> jit_prelu_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_prelu_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32}}; } -void jit_prelu_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_prelu_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -1718,7 +1919,8 @@ void jit_prelu_emitter::emit_impl(const std::vector& in_vec_idxs, const } template -void jit_prelu_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_prelu_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm 
vmm_src1 = Vmm(in_vec_idxs[1]); @@ -1761,13 +1963,16 @@ jit_sqrt_emitter::jit_sqrt_emitter(x64::jit_generator* host, jit_sqrt_emitter::jit_sqrt_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_sqrt_emitter::get_inputs_num() const { return 1; } +size_t jit_sqrt_emitter::get_inputs_num() const { + return 1; +} std::set> jit_sqrt_emitter::get_supported_precisions(const std::shared_ptr& node) { return {{element::f32}}; } -void jit_sqrt_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_sqrt_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -1780,12 +1985,12 @@ void jit_sqrt_emitter::emit_impl(const std::vector& in_vec_idxs, const s } template -void jit_sqrt_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_sqrt_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); - h->uni_vsqrtps(vmm_dst, vmm_src0); + h->uni_vsqrtps(vmm_dst, vmm_src0); } /// Negate /// @@ -1795,13 +2000,17 @@ jit_negative_emitter::jit_negative_emitter(x64::jit_generator* host, ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_negative_emitter::get_inputs_num() const { return 1; } +size_t jit_negative_emitter::get_inputs_num() const { + return 1; +} -std::set> jit_negative_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_negative_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } -void jit_negative_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void 
jit_negative_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -1814,33 +2023,38 @@ void jit_negative_emitter::emit_impl(const std::vector& in_vec_idxs, con } template -void jit_negative_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_negative_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src = Vmm(in_vec_idxs[0]); - Vmm vmm_dst = Vmm(out_vec_idxs[0]); + Vmm vmm_dst = Vmm(out_vec_idxs[0]); h->uni_vpxor(vmm_dst, vmm_dst, vmm_dst); h->uni_vsubps(vmm_dst, vmm_dst, vmm_src); } - /// EXP /// jit_exp_emitter::jit_exp_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -jit_exp_emitter::jit_exp_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, ov::element::Type exec_prc) +jit_exp_emitter::jit_exp_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + const std::shared_ptr& node, + ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -size_t jit_exp_emitter::get_inputs_num() const { return 1; } +size_t jit_exp_emitter::get_inputs_num() const { + return 1; +} std::set> jit_exp_emitter::get_supported_precisions(const std::shared_ptr& node) { return {{element::f32}}; } -void jit_exp_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_exp_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -1853,7 +2067,7 @@ void jit_exp_emitter::emit_impl(const std::vector &in_vec_idxs, const st } template -void 
jit_exp_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_exp_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src = Vmm(in_vec_idxs[0]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); @@ -1862,7 +2076,7 @@ void jit_exp_emitter::emit_isa(const std::vector &in_vec_idxs, const std Vmm vmm_aux0 = Vmm(aux_vec_idxs[0 + static_cast(need_vmm_mask())]); Vmm vmm_aux1 = Vmm(aux_vec_idxs[1 + static_cast(need_vmm_mask())]); - auto compute_cmp_mask = [&](const Vmm &vmm_src, const Xbyak::Operand &compare_operand, int cmp_predicate) { + auto compute_cmp_mask = [&](const Vmm& vmm_src, const Xbyak::Operand& compare_operand, int cmp_predicate) { if (host_isa_ == x64::avx512_core) { h->vcmpps(k_mask, vmm_src, compare_operand, cmp_predicate); } else { @@ -1870,7 +2084,7 @@ void jit_exp_emitter::emit_isa(const std::vector &in_vec_idxs, const std } }; - auto blend_with_mask = [&](const Vmm &vmm_dst, const Xbyak::Operand &src) { + auto blend_with_mask = [&](const Vmm& vmm_dst, const Xbyak::Operand& src) { if (host_isa_ == x64::avx512_core) { h->vblendmps(vmm_dst | k_mask, vmm_dst, src); } else { @@ -1924,11 +2138,11 @@ void jit_exp_emitter::emit_isa(const std::vector &in_vec_idxs, const std } void jit_exp_emitter::register_table_entries() { - push_arg_entry_of("pol1", 0x3f7ffffb, true); // p1 = 0.999999701f - push_arg_entry_of("pol2", 0x3efffee3, true); // p2 = 0.499991506f - push_arg_entry_of("pol3", 0x3e2aad40, true); // p3 = 0.166676521f - push_arg_entry_of("pol4", 0x3d2b9d0d, true); // p4 = 0.0418978221f - push_arg_entry_of("pol5", 0x3c07cfce, true); // p5 = 0.00828929059f + push_arg_entry_of("pol1", 0x3f7ffffb, true); // p1 = 0.999999701f + push_arg_entry_of("pol2", 0x3efffee3, true); // p2 = 0.499991506f + push_arg_entry_of("pol3", 0x3e2aad40, true); // p3 = 0.166676521f + push_arg_entry_of("pol4", 0x3d2b9d0d, true); // p4 = 0.0418978221f + 
push_arg_entry_of("pol5", 0x3c07cfce, true); // p5 = 0.00828929059f push_arg_entry_of("one", CONST_1_F, true); push_arg_entry_of("half", 0x3f000000, true); @@ -1950,16 +2164,21 @@ jit_erf_emitter::jit_erf_emitter(x64::jit_generator* host, x64::cpu_isa_t host_i prepare_table(); } -jit_erf_emitter::jit_erf_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, const std::shared_ptr& node, ov::element::Type exec_prc) +jit_erf_emitter::jit_erf_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + const std::shared_ptr& node, + ov::element::Type exec_prc) : jit_erf_emitter(host, host_isa, exec_prc) {} -size_t jit_erf_emitter::get_inputs_num() const { return 1; } +size_t jit_erf_emitter::get_inputs_num() const { + return 1; +} std::set> jit_erf_emitter::get_supported_precisions(const std::shared_ptr& node) { return {{element::f32}}; } -void jit_erf_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_erf_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -1972,7 +2191,7 @@ void jit_erf_emitter::emit_impl(const std::vector &in_vec_idxs, const st } template -void jit_erf_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_erf_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src = Vmm(in_vec_idxs[0]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); @@ -1991,8 +2210,11 @@ void jit_erf_emitter::emit_isa(const std::vector &in_vec_idxs, const std // pass the current `aux_vec_idxs` to `exp_emitter` excepting `vmm_aux3` auto exp_aux_vec_idxs = aux_vec_idxs; - exp_aux_vec_idxs.erase(std::find(exp_aux_vec_idxs.begin(), exp_aux_vec_idxs.end(), static_cast(vmm_aux3.getIdx()))); - m_exp_emitter->emit_code({static_cast(vmm_dst.getIdx())}, 
{static_cast(vmm_dst.getIdx())}, exp_aux_vec_idxs); + exp_aux_vec_idxs.erase( + std::find(exp_aux_vec_idxs.begin(), exp_aux_vec_idxs.end(), static_cast(vmm_aux3.getIdx()))); + m_exp_emitter->emit_code({static_cast(vmm_dst.getIdx())}, + {static_cast(vmm_dst.getIdx())}, + exp_aux_vec_idxs); h->uni_vxorps(vmm_dst, vmm_dst, table_val("sign_mask")); @@ -2027,16 +2249,16 @@ void jit_erf_emitter::emit_isa(const std::vector &in_vec_idxs, const std } void jit_erf_emitter::register_table_entries() { - push_arg_entry_of("approx_const", 0x3ea7ba05, true); // 0.3275911 + push_arg_entry_of("approx_const", 0x3ea7ba05, true); // 0.3275911 push_arg_entry_of("one", CONST_1_F, true); push_arg_entry_of("sign_mask", 0x80000000, true); push_arg_entry_of("positive_mask", 0x7fffffff, true); - push_arg_entry_of("pol1", 0x3e827906, true); // p1 = 0.254829592f - push_arg_entry_of("pol2", 0xbe91a98e, true); // p2 = -0.284496736f - push_arg_entry_of("pol3", 0x3fb5f0e3, true); // p3 = 1.421413741f - push_arg_entry_of("pol4", 0xbfba00e3, true); // p4 = -1.453152027f - push_arg_entry_of("pol5", 0x3f87dc22, true); // p5 = 1.061405429f + push_arg_entry_of("pol1", 0x3e827906, true); // p1 = 0.254829592f + push_arg_entry_of("pol2", 0xbe91a98e, true); // p2 = -0.284496736f + push_arg_entry_of("pol3", 0x3fb5f0e3, true); // p3 = 1.421413741f + push_arg_entry_of("pol4", 0xbfba00e3, true); // p4 = -1.453152027f + push_arg_entry_of("pol5", 0x3f87dc22, true); // p5 = 1.061405429f } size_t jit_erf_emitter::aux_vecs_count() const { @@ -2063,13 +2285,17 @@ jit_soft_sign_emitter::jit_soft_sign_emitter(x64::jit_generator* host, prepare_table(); } -size_t jit_soft_sign_emitter::get_inputs_num() const { return 1; } +size_t jit_soft_sign_emitter::get_inputs_num() const { + return 1; +} -std::set> jit_soft_sign_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_soft_sign_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32}}; } -void 
jit_soft_sign_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_soft_sign_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -2082,7 +2308,8 @@ void jit_soft_sign_emitter::emit_impl(const std::vector& in_vec_idxs, co } template -void jit_soft_sign_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_soft_sign_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src = Vmm(in_vec_idxs[0]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); @@ -2100,10 +2327,11 @@ void jit_soft_sign_emitter::register_table_entries() { /// IS_FINITE /// template <> -void jit_is_finite_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_is_finite_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { auto vmm_src = Zmm(in_vec_idxs[0]); auto vmm_dst = Zmm(out_vec_idxs[0]); - auto &ones_mask = h->k1; + auto& ones_mask = h->k1; auto reg32_one = Reg32(aux_gpr_idxs[0]); h->mov(reg32_one, CONST_1_F); @@ -2113,13 +2341,14 @@ void jit_is_finite_emitter::emit_isa(const std::vector } template -void jit_is_finite_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_is_finite_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional::type; auto vmm_src = Vmm(in_vec_idxs[0]); auto vmm_dst = Vmm(out_vec_idxs[0]); h->uni_vandps(vmm_src, vmm_src, table_val("inf")); - h->uni_vcmpps(vmm_src, vmm_src, table_val("inf"), 0B00000100); // NEq + h->uni_vcmpps(vmm_src, vmm_src, table_val("inf"), 0B00000100); // NEq if (isa == x64::avx2) { h->uni_vandps(vmm_dst, vmm_src, table_val("one")); @@ 
-2131,7 +2360,8 @@ void jit_is_finite_emitter::emit_isa(const std::vector &in_vec_idxs, con } } -void jit_is_finite_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_is_finite_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -2152,12 +2382,13 @@ void jit_is_finite_emitter::register_table_entries() { /// IS_INF /// template <> -void jit_is_inf_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_is_inf_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { Zmm vmm_src = Zmm(in_vec_idxs[0]); Zmm vmm_dst = Zmm(out_vec_idxs[0]); if (detect_negative || detect_positive) { - auto &ones_mask = h->k1; + auto& ones_mask = h->k1; auto reg32_one = Reg32(aux_gpr_idxs[0]); uint8_t imm = detect_negative ? 0B00010000 : 0B00000000; if (detect_positive) { @@ -2173,7 +2404,8 @@ void jit_is_inf_emitter::emit_isa(const std::vector &i } template -void jit_is_inf_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_is_inf_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional::type; if (detect_negative || detect_positive) { @@ -2204,7 +2436,8 @@ void jit_is_inf_emitter::emit_isa(const std::vector &in_vec_idxs, const } } -void jit_is_inf_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_is_inf_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -2226,10 +2459,11 @@ void jit_is_inf_emitter::register_table_entries() { /// IS_NAN /// template <> -void 
jit_is_nan_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_is_nan_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { auto vmm_src = Zmm(in_vec_idxs[0]); auto vmm_dst = Zmm(out_vec_idxs[0]); - auto &ones_mask = h->k1; + auto& ones_mask = h->k1; auto reg32_one = Reg32(aux_gpr_idxs[0]); h->mov(reg32_one, CONST_1_F); @@ -2238,7 +2472,8 @@ void jit_is_nan_emitter::emit_isa(const std::vector &i } template -void jit_is_nan_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_is_nan_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional::type; auto vmm_src = Vmm(in_vec_idxs[0]); auto vmm_dst = Vmm(out_vec_idxs[0]); @@ -2254,7 +2489,8 @@ void jit_is_nan_emitter::emit_isa(const std::vector &in_vec_idxs, const } } -void jit_is_nan_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_is_nan_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::avx512_core) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -2281,9 +2517,12 @@ jit_select_emitter::jit_select_emitter(x64::jit_generator* host, jit_select_emitter::jit_select_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_select_emitter::get_inputs_num() const { return 3; } +size_t jit_select_emitter::get_inputs_num() const { + return 3; +} -std::set> jit_select_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_select_emitter::get_supported_precisions( + const std::shared_ptr& node) { return {{element::f32, element::f32, element::f32}}; } @@ -2292,11 +2531,12 @@ size_t jit_select_emitter::aux_vecs_count() const { return 0; else if (host_isa_ == x64::avx2) // tmp 
vec for mask return 1; - else // mask should be xmm0 on sse41 + tmp vec for mask + else // mask should be xmm0 on sse41 + tmp vec for mask return 2; } -void jit_select_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_select_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -2309,7 +2549,8 @@ void jit_select_emitter::emit_impl(const std::vector &in_vec_idxs, const } template -void jit_select_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { +void jit_select_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_cond = Vmm(in_vec_idxs[0]); Vmm vmm_src0 = Vmm(in_vec_idxs[1]); @@ -2346,20 +2587,22 @@ jit_bitwise_and_emitter::jit_bitwise_and_emitter(x64::jit_generator* host, ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) {} -jit_bitwise_and_emitter::jit_bitwise_and_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) +jit_bitwise_and_emitter::jit_bitwise_and_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_bitwise_and_emitter::get_inputs_num() const { return 2; } +size_t jit_bitwise_and_emitter::get_inputs_num() const { + return 2; +} -std::set> jit_bitwise_and_emitter::get_supported_precisions(const std::shared_ptr& node) { - return { - {element::i8, element::i8}, - {element::u8, element::u8}, - {element::i32, element::i32} - }; +std::set> jit_bitwise_and_emitter::get_supported_precisions( + const std::shared_ptr& node) { + return {{element::i8, element::i8}, {element::u8, element::u8}, {element::i32, element::i32}}; } -void jit_bitwise_and_emitter::emit_impl(const std::vector& in_vec_idxs, 
const std::vector& out_vec_idxs) const { +void jit_bitwise_and_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -2372,7 +2615,8 @@ void jit_bitwise_and_emitter::emit_impl(const std::vector& in_vec_idxs, } template -void jit_bitwise_and_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_bitwise_and_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -2399,24 +2643,28 @@ jit_bitwise_not_emitter::jit_bitwise_not_emitter(x64::jit_generator* host, prepare_table(); } -jit_bitwise_not_emitter::jit_bitwise_not_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) +jit_bitwise_not_emitter::jit_bitwise_not_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } -size_t jit_bitwise_not_emitter::get_inputs_num() const { return 1; } +size_t jit_bitwise_not_emitter::get_inputs_num() const { + return 1; +} -std::set> jit_bitwise_not_emitter::get_supported_precisions(const std::shared_ptr& node) { - return { - {element::i8}, - {element::u8}, - {element::i32} - }; +std::set> jit_bitwise_not_emitter::get_supported_precisions( + const std::shared_ptr& node) { + return {{element::i8}, {element::u8}, {element::i32}}; } -size_t jit_bitwise_not_emitter::aux_vecs_count() const { return 1; } +size_t jit_bitwise_not_emitter::aux_vecs_count() const { + return 1; +} -void jit_bitwise_not_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_bitwise_not_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == 
x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -2429,7 +2677,8 @@ void jit_bitwise_not_emitter::emit_impl(const std::vector& in_vec_idxs, } template -void jit_bitwise_not_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_bitwise_not_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src = Vmm(in_vec_idxs[0]); Vmm vmm_dst = Vmm(out_vec_idxs[0]); @@ -2457,20 +2706,22 @@ jit_bitwise_or_emitter::jit_bitwise_or_emitter(x64::jit_generator* host, ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) {} -jit_bitwise_or_emitter::jit_bitwise_or_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) +jit_bitwise_or_emitter::jit_bitwise_or_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_bitwise_or_emitter::get_inputs_num() const { return 2; } +size_t jit_bitwise_or_emitter::get_inputs_num() const { + return 2; +} -std::set> jit_bitwise_or_emitter::get_supported_precisions(const std::shared_ptr& node) { - return { - {element::i8, element::i8}, - {element::u8, element::u8}, - {element::i32, element::i32} - }; +std::set> jit_bitwise_or_emitter::get_supported_precisions( + const std::shared_ptr& node) { + return {{element::i8, element::i8}, {element::u8, element::u8}, {element::i32, element::i32}}; } -void jit_bitwise_or_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_bitwise_or_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -2483,7 +2734,8 @@ void jit_bitwise_or_emitter::emit_impl(const std::vector& in_vec_idxs, c } template -void 
jit_bitwise_or_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_bitwise_or_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -2508,20 +2760,22 @@ jit_bitwise_xor_emitter::jit_bitwise_xor_emitter(x64::jit_generator* host, ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) {} -jit_bitwise_xor_emitter::jit_bitwise_xor_emitter(x64::jit_generator* host, x64::cpu_isa_t host_isa, ov::element::Type exec_prc) +jit_bitwise_xor_emitter::jit_bitwise_xor_emitter(x64::jit_generator* host, + x64::cpu_isa_t host_isa, + ov::element::Type exec_prc) : jit_emitter(host, host_isa, exec_prc) {} -size_t jit_bitwise_xor_emitter::get_inputs_num() const { return 2; } +size_t jit_bitwise_xor_emitter::get_inputs_num() const { + return 2; +} -std::set> jit_bitwise_xor_emitter::get_supported_precisions(const std::shared_ptr& node) { - return { - {element::i8, element::i8}, - {element::u8, element::u8}, - {element::i32, element::i32} - }; +std::set> jit_bitwise_xor_emitter::get_supported_precisions( + const std::shared_ptr& node) { + return {{element::i8, element::i8}, {element::u8, element::u8}, {element::i32, element::i32}}; } -void jit_bitwise_xor_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_bitwise_xor_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { if (host_isa_ == x64::sse41) { emit_isa(in_vec_idxs, out_vec_idxs); } else if (host_isa_ == x64::avx2) { @@ -2534,7 +2788,8 @@ void jit_bitwise_xor_emitter::emit_impl(const std::vector& in_vec_idxs, } template -void jit_bitwise_xor_emitter::emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_bitwise_xor_emitter::emit_isa(const std::vector& in_vec_idxs, + const std::vector& 
out_vec_idxs) const { using Vmm = typename conditional3::type; Vmm vmm_src0 = Vmm(in_vec_idxs[0]); Vmm vmm_src1 = Vmm(in_vec_idxs[1]); @@ -2543,5 +2798,5 @@ void jit_bitwise_xor_emitter::emit_isa(const std::vector& in_vec_idxs, c h->uni_vxorps(vmm_dst, vmm_src0, vmm_src1); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.hpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.hpp index c8c4b06d6f3347..84c65d44a12280 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.hpp @@ -11,418 +11,488 @@ namespace intel_cpu { class jit_add_emitter : public jit_emitter { public: - jit_add_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_add_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_add_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n); + jit_add_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_mul_add_emitter : public jit_emitter { public: 
- jit_mul_add_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_mul_add_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_mul_add_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n); + jit_mul_add_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; size_t aux_vecs_count() const override; }; - class jit_subtract_emitter : public jit_emitter { public: - jit_subtract_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_subtract_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_subtract_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n); + jit_subtract_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: 
- void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; - class jit_multiply_emitter : public jit_emitter { public: - jit_multiply_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_multiply_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_multiply_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n); + jit_multiply_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; - class jit_divide_emitter : public jit_emitter { public: - jit_divide_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_divide_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_divide_emitter(dnnl::impl::cpu::x64::jit_generator *host, 
dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_divide_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; size_t aux_vecs_count() const override; }; class jit_floor_emitter : public jit_emitter { public: - jit_floor_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_floor_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_floor_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_floor_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, 
const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_ceiling_emitter : public jit_emitter { public: - jit_ceiling_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - ov::element::Type exec_prc = ov::element::f32); - jit_ceiling_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - ov::element::Type exec_prc = ov::element::f32); + jit_ceiling_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + ov::element::Type exec_prc = ov::element::f32); + jit_ceiling_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_floor_mod_emitter : public jit_emitter { public: - jit_floor_mod_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_floor_mod_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_floor_mod_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + 
jit_floor_mod_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; size_t aux_vecs_count() const override; }; - class jit_mod_emitter : public jit_emitter { public: - jit_mod_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_mod_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_mod_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_mod_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& 
in_vec_idxs, const std::vector& out_vec_idxs) const; size_t aux_vecs_count() const override; }; - class jit_maximum_emitter : public jit_emitter { public: - jit_maximum_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_maximum_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_maximum_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n); + jit_maximum_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; - class jit_minimum_emitter : public jit_emitter { public: - jit_minimum_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_minimum_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_minimum_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n); + jit_minimum_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& 
node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; - class jit_squared_difference_emitter : public jit_emitter { public: - jit_squared_difference_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_squared_difference_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_squared_difference_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_squared_difference_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; - class jit_power_dynamic_emitter : public jit_emitter { public: - jit_power_dynamic_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + 
jit_power_dynamic_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_power_dynamic_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_power_dynamic_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; - class jit_equal_emitter : public jit_emitter { public: - jit_equal_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_equal_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_equal_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_equal_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, 
const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; size_t aux_vecs_count() const override; }; - class jit_not_equal_emitter : public jit_emitter { public: - jit_not_equal_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_not_equal_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_not_equal_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_not_equal_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; size_t aux_vecs_count() const override; }; - class jit_greater_emitter : public jit_emitter { public: - jit_greater_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_greater_emitter(dnnl::impl::cpu::x64::jit_generator* host, + 
dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_greater_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_greater_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; size_t aux_vecs_count() const override; }; - class jit_greater_equal_emitter : public jit_emitter { public: - jit_greater_equal_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_greater_equal_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_greater_equal_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_greater_equal_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void 
emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; size_t aux_vecs_count() const override; }; - class jit_less_emitter : public jit_emitter { public: - jit_less_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_less_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_less_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_less_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; size_t aux_vecs_count() const override; }; - class jit_less_equal_emitter : public jit_emitter { public: - jit_less_equal_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + 
jit_less_equal_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_less_equal_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_less_equal_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; size_t aux_vecs_count() const override; }; - class jit_logical_and_emitter : public jit_emitter { public: - jit_logical_and_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_logical_and_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_logical_and_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_logical_and_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + 
const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; size_t aux_vecs_count() const override; }; - class jit_logical_or_emitter : public jit_emitter { public: - jit_logical_or_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_logical_or_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_logical_or_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_logical_or_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; size_t aux_vecs_count() const override; }; - class jit_logical_xor_emitter : public jit_emitter { public: - jit_logical_xor_emitter(dnnl::impl::cpu::x64::jit_generator *host, 
dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_logical_xor_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_logical_xor_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_logical_xor_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; size_t aux_vecs_count() const override; @@ -430,19 +500,23 @@ class jit_logical_xor_emitter : public jit_emitter { class jit_logical_not_emitter : public jit_emitter { public: - jit_logical_not_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_logical_not_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_logical_not_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_logical_not_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static 
std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; size_t aux_vecs_count() const override; @@ -450,21 +524,26 @@ class jit_logical_not_emitter : public jit_emitter { class jit_power_static_emitter : public jit_emitter { public: - jit_power_static_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - float inpPower, float inpScale, float inpShift, + jit_power_static_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + float inpPower, + float inpScale, + float inpShift, ov::element::Type exec_prc = ov::element::f32); - jit_power_static_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_power_static_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); - + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void 
emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; size_t aux_vecs_count() const override; @@ -476,73 +555,90 @@ class jit_power_static_emitter : public jit_emitter { class jit_prelu_emitter : public jit_emitter { public: - jit_prelu_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_prelu_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_prelu_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_prelu_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; size_t aux_vecs_count() const override; }; class jit_sqrt_emitter : public jit_emitter { public: - jit_sqrt_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - ov::element::Type exec_prc = ov::element::f32); - jit_sqrt_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - ov::element::Type exec_prc = ov::element::f32); + jit_sqrt_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t 
host_isa, + ov::element::Type exec_prc = ov::element::f32); + jit_sqrt_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_negative_emitter : public jit_emitter { public: - jit_negative_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - ov::element::Type exec_prc = ov::element::f32); + jit_negative_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: void emit_impl(const std::vector& in, const std::vector& out) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_exp_emitter : public jit_emitter { public: - jit_exp_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_exp_emitter(dnnl::impl::cpu::x64::jit_generator* host, + 
dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_exp_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_exp_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; - bool need_vmm_mask() const { return host_isa_ != dnnl::impl::cpu::x64::avx512_core; } + bool need_vmm_mask() const { + return host_isa_ != dnnl::impl::cpu::x64::avx512_core; + } void register_table_entries() override; size_t aux_vecs_count() const override; @@ -550,103 +646,132 @@ class jit_exp_emitter : public jit_emitter { class jit_erf_emitter : public jit_emitter { public: - jit_erf_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - ov::element::Type exec_prc = ov::element::f32); + jit_erf_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + ov::element::Type exec_prc = ov::element::f32); - jit_erf_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_erf_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = 
ov::element::f32); void emit_data() const override; size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl( - const std::vector &in_vec_idxs, - const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; size_t aux_vecs_count() const override; - std::unique_ptr m_exp_emitter {nullptr}; + std::unique_ptr m_exp_emitter{nullptr}; }; class jit_soft_sign_emitter : public jit_emitter { public: - jit_soft_sign_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_soft_sign_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_soft_sign_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_soft_sign_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void 
emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; void register_table_entries() override; }; class jit_is_finite_emitter : public jit_emitter { public: - jit_is_finite_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t hostIsa, - ov::element::Type execPrc = ov::element::f32) : jit_emitter(host, hostIsa, execPrc) { + jit_is_finite_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t hostIsa, + ov::element::Type execPrc = ov::element::f32) + : jit_emitter(host, hostIsa, execPrc) { prepare_table(); } - jit_is_finite_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t hostIsa, const std::shared_ptr& node, - ov::element::Type execPrc = ov::element::f32) : jit_emitter(host, hostIsa, execPrc) { + jit_is_finite_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t hostIsa, + const std::shared_ptr& node, + ov::element::Type execPrc = ov::element::f32) + : jit_emitter(host, hostIsa, execPrc) { prepare_table(); } - size_t get_inputs_num() const override { return 1; }; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr) { + size_t get_inputs_num() const override { + return 1; + }; + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr) { return {{element::f32}}; } protected: - size_t aux_gprs_count() const override { return (entry_map_.empty() ? 0 : 1) + 1; } + size_t aux_gprs_count() const override { + return (entry_map_.empty() ? 
0 : 1) + 1; + } void register_table_entries() override; private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_is_inf_emitter : public jit_emitter { public: - jit_is_inf_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t hostIsa, - ov::element::Type execPrc = ov::element::f32, bool detect_negative = true, bool detect_positive = true) - : jit_emitter(host, hostIsa, execPrc), detect_negative(detect_negative), detect_positive(detect_positive) { + jit_is_inf_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t hostIsa, + ov::element::Type execPrc = ov::element::f32, + bool detect_negative = true, + bool detect_positive = true) + : jit_emitter(host, hostIsa, execPrc), + detect_negative(detect_negative), + detect_positive(detect_positive) { prepare_table(); } - jit_is_inf_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t hostIsa, const std::shared_ptr& node, - ov::element::Type execPrc = ov::element::f32): jit_emitter(host, hostIsa, execPrc) { + jit_is_inf_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t hostIsa, + const std::shared_ptr& node, + ov::element::Type execPrc = ov::element::f32) + : jit_emitter(host, hostIsa, execPrc) { prepare_table(); } - size_t get_inputs_num() const override { return 1; }; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr) { + size_t get_inputs_num() const override { + return 1; + }; + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr) { return {{element::f32}}; } protected: - size_t aux_gprs_count() 
const override { return (entry_map_.empty() ? 0 : 1) + 1; } + size_t aux_gprs_count() const override { + return (entry_map_.empty() ? 0 : 1) + 1; + } void register_table_entries() override; private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; bool detect_negative; bool detect_positive; @@ -654,58 +779,76 @@ class jit_is_inf_emitter : public jit_emitter { class jit_is_nan_emitter : public jit_emitter { public: - jit_is_nan_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t hostIsa, - ov::element::Type execPrc = ov::element::f32) : jit_emitter(host, hostIsa, execPrc) { + jit_is_nan_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t hostIsa, + ov::element::Type execPrc = ov::element::f32) + : jit_emitter(host, hostIsa, execPrc) { prepare_table(); } - jit_is_nan_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t hostIsa, const std::shared_ptr& node, - ov::element::Type execPrc = ov::element::f32) : jit_emitter(host, hostIsa, execPrc) { + jit_is_nan_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t hostIsa, + const std::shared_ptr& node, + ov::element::Type execPrc = ov::element::f32) + : jit_emitter(host, hostIsa, execPrc) { prepare_table(); } - size_t get_inputs_num() const override { return 1; } - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr) { + size_t get_inputs_num() const override { + return 1; + } + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr) { return {{element::f32}}; } protected: - size_t aux_gprs_count() const override { 
return (entry_map_.empty() ? 0 : 1) + 1; } + size_t aux_gprs_count() const override { + return (entry_map_.empty() ? 0 : 1) + 1; + } void register_table_entries() override; private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_select_emitter : public jit_emitter { public: - jit_select_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_select_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::element::Type exec_prc = ov::element::f32); - jit_select_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, + jit_select_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); size_t aux_vecs_count() const override; private: - void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; template - void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; class jit_bitwise_and_emitter : public jit_emitter { public: - jit_bitwise_and_emitter(dnnl::impl::cpu::x64::jit_generator* host, 
dnnl::impl::cpu::x64::cpu_isa_t host_isa, - ov::element::Type exec_prc = ov::element::f32); - jit_bitwise_and_emitter(dnnl::impl::cpu::x64::jit_generator* host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - ov::element::Type exec_prc = ov::element::f32); + jit_bitwise_and_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + ov::element::Type exec_prc = ov::element::f32); + jit_bitwise_and_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; @@ -716,13 +859,17 @@ class jit_bitwise_and_emitter : public jit_emitter { class jit_bitwise_not_emitter : public jit_emitter { public: - jit_bitwise_not_emitter(dnnl::impl::cpu::x64::jit_generator* host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - ov::element::Type exec_prc = ov::element::f32); - jit_bitwise_not_emitter(dnnl::impl::cpu::x64::jit_generator* host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - ov::element::Type exec_prc = ov::element::f32); + jit_bitwise_not_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + ov::element::Type exec_prc = ov::element::f32); + jit_bitwise_not_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); 
size_t aux_vecs_count() const override; private: @@ -735,13 +882,17 @@ class jit_bitwise_not_emitter : public jit_emitter { class jit_bitwise_or_emitter : public jit_emitter { public: - jit_bitwise_or_emitter(dnnl::impl::cpu::x64::jit_generator* host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - ov::element::Type exec_prc = ov::element::f32); - jit_bitwise_or_emitter(dnnl::impl::cpu::x64::jit_generator* host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - ov::element::Type exec_prc = ov::element::f32); + jit_bitwise_or_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + ov::element::Type exec_prc = ov::element::f32); + jit_bitwise_or_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; @@ -752,13 +903,17 @@ class jit_bitwise_or_emitter : public jit_emitter { class jit_bitwise_xor_emitter : public jit_emitter { public: - jit_bitwise_xor_emitter(dnnl::impl::cpu::x64::jit_generator* host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - ov::element::Type exec_prc = ov::element::f32); - jit_bitwise_xor_emitter(dnnl::impl::cpu::x64::jit_generator* host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, - ov::element::Type exec_prc = ov::element::f32); + jit_bitwise_xor_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + ov::element::Type exec_prc = ov::element::f32); + jit_bitwise_xor_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n, + 
ov::element::Type exec_prc = ov::element::f32); size_t get_inputs_num() const override; - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; @@ -767,5 +922,5 @@ class jit_bitwise_xor_emitter : public jit_emitter { void emit_isa(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_emitter.cpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_emitter.cpp index acbb04ea01af80..7ee4d5184b311a 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_emitter.cpp @@ -3,9 +3,11 @@ // #include "jit_emitter.hpp" + #include -#include "utils/general_utils.h" + #include "utils.hpp" +#include "utils/general_utils.h" using namespace dnnl::impl::cpu; using namespace dnnl::impl; @@ -19,11 +21,12 @@ size_t jit_emitter::get_max_vecs_count() const { } size_t jit_emitter::get_vec_length() const { - return one_of(host_isa_, cpu::x64::avx512_core, cpu::x64::avx512_core) ? 64 : - one_of(host_isa_, cpu::x64::avx2) ? 32 : 16; + return one_of(host_isa_, cpu::x64::avx512_core, cpu::x64::avx512_core) ? 64 + : one_of(host_isa_, cpu::x64::avx2) ? 
32 + : 16; } -void jit_emitter::push_vec(const Xbyak::Address &addr, size_t vec_idx) const { +void jit_emitter::push_vec(const Xbyak::Address& addr, size_t vec_idx) const { if (host_isa_ == cpu::x64::sse41) { h->uni_vmovups(addr, Xmm(vec_idx)); } else if (host_isa_ == cpu::x64::avx2) { @@ -33,7 +36,7 @@ void jit_emitter::push_vec(const Xbyak::Address &addr, size_t vec_idx) const { } } -void jit_emitter::pop_vec(size_t vec_idx, const Xbyak::Address &addr) const { +void jit_emitter::pop_vec(size_t vec_idx, const Xbyak::Address& addr) const { if (host_isa_ == cpu::x64::sse41) { h->uni_vmovups(Xmm(vec_idx), addr); } else if (host_isa_ == cpu::x64::avx2) { @@ -60,11 +63,15 @@ std::set> jit_emitter::get_supported_precisions(const return {}; } -void jit_emitter::emitter_preamble(const std::vector &in_idxs, const std::vector &out_idxs, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const { +void jit_emitter::emitter_preamble(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const { using namespace Xbyak::util; - bool is_vec_input = (in_out_type_ == emitter_in_out_map::vec_to_vec) || (in_out_type_ == emitter_in_out_map::vec_to_gpr); - bool is_vec_output = (in_out_type_ == emitter_in_out_map::vec_to_vec) || (in_out_type_ == emitter_in_out_map::gpr_to_vec); + bool is_vec_input = + (in_out_type_ == emitter_in_out_map::vec_to_vec) || (in_out_type_ == emitter_in_out_map::vec_to_gpr); + bool is_vec_output = + (in_out_type_ == emitter_in_out_map::vec_to_vec) || (in_out_type_ == emitter_in_out_map::gpr_to_vec); for (auto idx : pool_vec_idxs) aux_vec_idxs.push_back(idx); @@ -73,9 +80,11 @@ void jit_emitter::emitter_preamble(const std::vector &in_idxs, const std if (host_isa_ == cpu::x64::sse41 && aux_vecs_count() > 0) { size_t idx = 0; if (is_vec_input) - OV_CPU_JIT_EMITTER_ASSERT(std::find(in_idxs.begin(), in_idxs.end(), idx) == in_idxs.end(), "Xmm(0) cannot be input register in 
SSE41"); + OV_CPU_JIT_EMITTER_ASSERT(std::find(in_idxs.begin(), in_idxs.end(), idx) == in_idxs.end(), + "Xmm(0) cannot be input register in SSE41"); if (is_vec_output) - OV_CPU_JIT_EMITTER_ASSERT(std::find(out_idxs.begin(), out_idxs.end(), idx) == out_idxs.end(), "Xmm(0) cannot be output register in SSE41"); + OV_CPU_JIT_EMITTER_ASSERT(std::find(out_idxs.begin(), out_idxs.end(), idx) == out_idxs.end(), + "Xmm(0) cannot be output register in SSE41"); if (std::find(aux_vec_idxs.begin(), aux_vec_idxs.end(), idx) == aux_vec_idxs.end()) { aux_vec_idxs.push_back(idx); preserved_vec_idxs.push_back(idx); @@ -93,16 +102,21 @@ void jit_emitter::emitter_preamble(const std::vector &in_idxs, const std } for (size_t idx = 0; idx < get_max_vecs_count(); idx++) { - if (aux_vec_idxs.size() >= aux_vecs_count()) break; + if (aux_vec_idxs.size() >= aux_vecs_count()) + break; if (is_vec_input) { - if (std::find(in_idxs.begin(), in_idxs.end(), idx) != in_idxs.end()) continue; + if (std::find(in_idxs.begin(), in_idxs.end(), idx) != in_idxs.end()) + continue; } if (is_vec_output) { - if (std::find(out_idxs.begin(), out_idxs.end(), idx) != out_idxs.end()) continue; + if (std::find(out_idxs.begin(), out_idxs.end(), idx) != out_idxs.end()) + continue; } - if (std::find(aux_vec_idxs.begin(), aux_vec_idxs.end(), idx) != aux_vec_idxs.end()) continue; - if (std::find(preserved_vec_idxs.begin(), preserved_vec_idxs.end(), idx) != preserved_vec_idxs.end()) continue; + if (std::find(aux_vec_idxs.begin(), aux_vec_idxs.end(), idx) != aux_vec_idxs.end()) + continue; + if (std::find(preserved_vec_idxs.begin(), preserved_vec_idxs.end(), idx) != preserved_vec_idxs.end()) + continue; aux_vec_idxs.push_back(idx); preserved_vec_idxs.push_back(idx); @@ -115,18 +129,24 @@ void jit_emitter::emitter_preamble(const std::vector &in_idxs, const std aux_gpr_idxs.push_back(idx); for (size_t gpr_idx = 0; gpr_idx <= Operand::R15; ++gpr_idx) { - size_t _idx = Operand::R15 - gpr_idx; // we allocate from the end + size_t 
_idx = Operand::R15 - gpr_idx; // we allocate from the end - if (aux_gpr_idxs.size() >= aux_gprs_count()) break; - if (_idx == Operand::RSP) continue; + if (aux_gpr_idxs.size() >= aux_gprs_count()) + break; + if (_idx == Operand::RSP) + continue; if (!is_vec_input) { - if (std::find(in_idxs.begin(), in_idxs.end(), _idx) != in_idxs.end()) continue; + if (std::find(in_idxs.begin(), in_idxs.end(), _idx) != in_idxs.end()) + continue; } if (!is_vec_output) { - if (std::find(out_idxs.begin(), out_idxs.end(), _idx) != out_idxs.end()) continue; + if (std::find(out_idxs.begin(), out_idxs.end(), _idx) != out_idxs.end()) + continue; } - if (std::find(aux_gpr_idxs.begin(), aux_gpr_idxs.end(), _idx) != aux_gpr_idxs.end()) continue; - if (std::find(preserved_gpr_idxs.begin(), preserved_gpr_idxs.end(), _idx) != preserved_gpr_idxs.end()) continue; + if (std::find(aux_gpr_idxs.begin(), aux_gpr_idxs.end(), _idx) != aux_gpr_idxs.end()) + continue; + if (std::find(preserved_gpr_idxs.begin(), preserved_gpr_idxs.end(), _idx) != preserved_gpr_idxs.end()) + continue; aux_gpr_idxs.push_back(_idx); preserved_gpr_idxs.push_back(_idx); @@ -154,7 +174,6 @@ void jit_emitter::emitter_preamble(const std::vector &in_idxs, const std load_table_addr(); } - void jit_emitter::emitter_postamble() const { using namespace Xbyak::util; @@ -183,7 +202,7 @@ void jit_emitter::emit_data() const { // Run through the map and insert values stored there for (auto it = entry_map_.begin(); it != entry_map_.end(); it++) { - const auto &te = (*it).second; // get map entry for a given key + const auto& te = (*it).second; // get map entry for a given key const auto len = te.bcast ? get_vec_length() : sizeof(table_entry_val_t); for (size_t d = 0; d < len; d += sizeof(table_entry_val_t)) h->dd(te.val); @@ -199,14 +218,16 @@ void jit_emitter::prepare_table() { // prepare_table. 
size_t off = 0; for (auto it = entry_map_.begin(); it != entry_map_.end(); it++) { - auto &te = (*it).second; + auto& te = (*it).second; te.off = off; off += te.bcast ? get_vec_length() : sizeof(table_entry_val_t); } } -void jit_emitter::emit_code(const std::vector &in_idxs, const std::vector &out_idxs, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const { +void jit_emitter::emit_code(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const { emitter_preamble(in_idxs, out_idxs, pool_vec_idxs, pool_gpr_idxs); emit_impl(in_idxs, out_idxs); @@ -214,5 +235,5 @@ void jit_emitter::emit_code(const std::vector &in_idxs, const std::vecto emitter_postamble(); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_emitter.hpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_emitter.hpp index c5729613f1bfe5..04ac2e6ea0684d 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_emitter.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_emitter.hpp @@ -4,17 +4,17 @@ #pragma once -#include "cpu/x64/jit_generator.hpp" - -#include "snippets/snippets_isa.hpp" -#include "snippets/generator.hpp" -#include "emitters/utils.hpp" #include #include +#include "cpu/x64/jit_generator.hpp" +#include "emitters/utils.hpp" +#include "snippets/generator.hpp" +#include "snippets/snippets_isa.hpp" + #ifdef SNIPPETS_DEBUG_CAPS -#include "emitters/snippets/x64/verbose.hpp" +# include "emitters/snippets/x64/verbose.hpp" #endif namespace ov { @@ -34,14 +34,23 @@ struct emitter_params { class jit_emitter : public ov::snippets::Emitter { public: - jit_emitter(dnnl::impl::cpu::x64::jit_generator* host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - ov::element::Type exec_prc = ov::element::f32, emitter_in_out_map in_out_type = emitter_in_out_map::vec_to_vec) - : Emitter(), h(host), 
host_isa_(host_isa), exec_prc_(exec_prc), l_table (new Xbyak::Label()), in_out_type_(in_out_type) { - k_mask = Xbyak::Opmask(1); // FIXME: in general case we need preserve k_mask state as well + jit_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + ov::element::Type exec_prc = ov::element::f32, + emitter_in_out_map in_out_type = emitter_in_out_map::vec_to_vec) + : Emitter(), + h(host), + host_isa_(host_isa), + exec_prc_(exec_prc), + l_table(new Xbyak::Label()), + in_out_type_(in_out_type) { + k_mask = Xbyak::Opmask(1); // FIXME: in general case we need preserve k_mask state as well } - void emit_code(const std::vector &in_idxs, const std::vector &out_idxs, - const std::vector &pool_vec_idxs = {}, const std::vector &pool_gpr_idxs = {}) const override; + void emit_code(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs = {}, + const std::vector& pool_gpr_idxs = {}) const override; void emit_data() const override; virtual size_t get_inputs_num() const = 0; @@ -53,10 +62,11 @@ class jit_emitter : public ov::snippets::Emitter { * Precisions are ordered, the first bigger bitness precision with the same type will be selected. * Empty collection means the emitter supports any input precisions. 
*/ - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); #ifdef SNIPPETS_DEBUG_CAPS - const char *info() const { + const char* info() const { if (!info_.is_initialized()) info_.init(this); return info_.c_str(); @@ -77,12 +87,14 @@ class jit_emitter : public ov::snippets::Emitter { virtual void prepare_table(); virtual void register_table_entries() {} - void load_table_addr() const { h->mov(p_table, *l_table.get()); } + void load_table_addr() const { + h->mov(p_table, *l_table.get()); + } // we accept only 32bit hexadecimal table values to avoid any rounding using table_entry_val_t = uint32_t; - using table_entry_offset_t = size_t; // offsets are in bytes wrt p_table - using table_entry_bcast_t = bool; // true => bcast value + using table_entry_offset_t = size_t; // offsets are in bytes wrt p_table + using table_entry_bcast_t = bool; // true => bcast value struct table_entry_t { table_entry_val_t val; @@ -106,10 +118,12 @@ class jit_emitter : public ov::snippets::Emitter { _cmp_gt_os = dnnl::impl::cpu::x64::jit_generator::_cmp_nle_us, }; - virtual void emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const = 0; + virtual void emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const = 0; - virtual void emitter_preamble(const std::vector &in_idxs, const std::vector &out_idxs, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const; + virtual void emitter_preamble(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const; virtual void emitter_postamble() const; emitter_in_out_map in_out_type_; @@ -132,14 +146,14 @@ class jit_emitter : public ov::snippets::Emitter { mapped_table_t entry_map_; void push_arg_entry_of(const std::string key, const table_entry_val_t val, const bool broadcast) { - mapped_table_entry_t te {0, val, 
broadcast}; + mapped_table_entry_t te{0, val, broadcast}; entry_map_.insert(std::make_pair(key, te)); } - void push_entries_of(const table_t &t) { + void push_entries_of(const table_t& t) { for (auto it = t.begin(); it != t.end(); it++) { auto key = (*it).first; - auto te = (*it).second; // copy values from table + auto te = (*it).second; // copy values from table push_arg_entry_of(key, te.val, te.bcast); } } @@ -155,20 +169,20 @@ class jit_emitter : public ov::snippets::Emitter { mutable std::vector preserved_vec_idxs; mutable std::vector preserved_gpr_idxs; - void push_vec(const Xbyak::Address &addr, size_t vec_idx) const; - void pop_vec(size_t vec_idx, const Xbyak::Address &addr) const; + void push_vec(const Xbyak::Address& addr, size_t vec_idx) const; + void pop_vec(size_t vec_idx, const Xbyak::Address& addr) const; size_t table_off(std::string& key, size_t key_off_val_shift = 0) const { // assumption: all table entries sharing the same key also // share their broadcast property // TODO: enforce through data structure - const auto it = entry_map_.find(key); // search an entry for a key + const auto it = entry_map_.find(key); // search an entry for a key OV_CPU_JIT_EMITTER_ASSERT(it != entry_map_.end(), "Value has not been found in the table"); - const auto &te = (*it).second; + const auto& te = (*it).second; const auto scale = te.bcast ? 
get_vec_length() : sizeof(table_entry_val_t); return te.off + key_off_val_shift * scale; } }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp index 893c18768a9511..513c1f70d22932 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.cpp @@ -3,6 +3,7 @@ // #include "jit_load_store_emitters.hpp" + #include "utils/bfloat16.hpp" using namespace dnnl::impl; @@ -16,19 +17,20 @@ using namespace Xbyak::util; // An auxiliary vector reg(data_reg_new) is used as destination vector for source pollution instructions, // After updated, processed with new vector and no more need to update as source is preserved. // e.g. with STORE_KEEP_SOURCE(vextractf128, xmm, Xmm(aux_src_idx), ymm, 1); -// if ymm is already updated, h->vextractf128(xmm, ymm, 1) is used, which change ymm values as xmm and ymm have the same index. -// if ymm is not updated, h->vextractf128(Xmm(aux_src_idx), ymm, 1) is used, which keep ymm values unchanged as destination is another vector reg. +// if ymm is already updated, h->vextractf128(xmm, ymm, 1) is used, which change ymm values as xmm and ymm have the +// same index. if ymm is not updated, h->vextractf128(Xmm(aux_src_idx), ymm, 1) is used, which keep ymm values +// unchanged as destination is another vector reg. #define STORE_KEEP_SOURCE(instruction, data_reg, data_reg_new, ...) 
\ - if (data_reg_updated) { \ - h->instruction(data_reg, __VA_ARGS__); \ - } else { \ - h->instruction(data_reg_new, __VA_ARGS__); \ - data_idx = aux_src_idx; \ - xmm = Xbyak::Xmm(data_idx); \ - ymm = Xbyak::Ymm(data_idx); \ - zmm = Xbyak::Zmm(data_idx); \ - vmm = Vmm(data_idx); \ - data_reg_updated = true; \ + if (data_reg_updated) { \ + h->instruction(data_reg, __VA_ARGS__); \ + } else { \ + h->instruction(data_reg_new, __VA_ARGS__); \ + data_idx = aux_src_idx; \ + xmm = Xbyak::Xmm(data_idx); \ + ymm = Xbyak::Ymm(data_idx); \ + zmm = Xbyak::Zmm(data_idx); \ + vmm = Vmm(data_idx); \ + data_reg_updated = true; \ } namespace ov { @@ -39,7 +41,7 @@ namespace { constexpr int threshold_for_mask_emu_load = 14; // heuristic threshold number by byte between mask store and emulation with several simple partial store constexpr int threshold_for_mask_emu_store = 6; -} // namespace +} // namespace size_t load_emitter_params::hash() const { size_t seed = 0; @@ -61,46 +63,69 @@ size_t store_emitter_params::hash() const { return seed; } -static int get_aux_regs_as_temp(const int elem_count, const int data_size, bool is_pure_move, bool is_store_as_real16, - const int avx512_threshold_for_mask = 0, const bool is_fill = false) { +static int get_aux_regs_as_temp(const int elem_count, + const int data_size, + bool is_pure_move, + bool is_store_as_real16, + const int avx512_threshold_for_mask = 0, + const bool is_fill = false) { if (mayiuse(cpu::x64::avx512_core) && is_fill) return 1; // for pure move, there are direct no-mask instructions to move on full xmm/ymm/zmm, so aux_gpr is not needed. // for move+convert: - // there are direct no-mask instructions to load i8/u8/i16/u16/bf16/fp16 to full xmm/ymm/zmm as f32/i32, so aux_gpr is not needed. - // there are direct no-mask instructions to store i32 on full xmm/ymm/zmm to i8/u8/i16/u16, so aux_gpr is not needed. 
- // store f32 on full xmm/ymm/zmm to bf16/fp16, need convert to bf16/fp16 on vmm, then store vmm to memory, use store_dword_to_word/byte_base condition. - // store_num == 16, vector: 16 * f32 -> 16 * bf16 -> ymm(256bit) -> store - // store_num == 8, vector: 8 * f32 -> 8 * bf16 -> xmm(128bit) -> store - // store_num == 4, vector: 4 * f32 -> 4 * bf16 -> 64bit -> masked instruction with aux_gpr needed - // f32<->i32 is on full vmm, so aux_gpr is not needed. + // there are direct no-mask instructions to load i8/u8/i16/u16/bf16/fp16 to full xmm/ymm/zmm as f32/i32, so aux_gpr + // is not needed. there are direct no-mask instructions to store i32 on full xmm/ymm/zmm to i8/u8/i16/u16, so + // aux_gpr is not needed. store f32 on full xmm/ymm/zmm to bf16/fp16, need convert to bf16/fp16 on vmm, then store + // vmm to memory, use store_dword_to_word/byte_base condition. store_num == 16, vector: 16 * f32 -> 16 * bf16 -> + // ymm(256bit) -> store store_num == 8, vector: 8 * f32 -> 8 * bf16 -> xmm(128bit) -> store store_num == 4, + // vector: 4 * f32 -> 4 * bf16 -> 64bit -> masked instruction with aux_gpr needed f32<->i32 is on full vmm, + // so aux_gpr is not needed. 
const int byte_size = elem_count * data_size; - if ((is_pure_move && one_of(byte_size, 16, 32, 64)) || (!is_pure_move && one_of(elem_count, 4, 8, 16) && !is_store_as_real16)) + if ((is_pure_move && one_of(byte_size, 16, 32, 64)) || + (!is_pure_move && one_of(elem_count, 4, 8, 16) && !is_store_as_real16)) return 0; - if ((mayiuse(cpu::x64::avx512_core) && (byte_size > avx512_threshold_for_mask)) || (one_of(byte_size % 16, 1, 2, 3))) + if ((mayiuse(cpu::x64::avx512_core) && (byte_size > avx512_threshold_for_mask)) || + (one_of(byte_size % 16, 1, 2, 3))) return 1; return 0; } /// LOAD /// -jit_load_emitter::jit_load_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - ov::element::Type src_prc, ov::element::Type dst_prc, int load_num, ov::element::Type exec_prc, - bool is_fill, std::string fill_value, emitter_in_out_map in_out_type) -: jit_emitter(host, host_isa, exec_prc, in_out_type), name_("unknown"), load_num_(load_num), src_prc_(src_prc), - dst_prc_(dst_prc), is_fill_(is_fill), fill_value_(fill_value) { +jit_load_emitter::jit_load_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + ov::element::Type src_prc, + ov::element::Type dst_prc, + int load_num, + ov::element::Type exec_prc, + bool is_fill, + std::string fill_value, + emitter_in_out_map in_out_type) + : jit_emitter(host, host_isa, exec_prc, in_out_type), + name_("unknown"), + load_num_(load_num), + src_prc_(src_prc), + dst_prc_(dst_prc), + is_fill_(is_fill), + fill_value_(fill_value) { prepare_table(); load_size_ = load_num * src_prc.size(); v_len_elt_ = get_vec_length() / exec_prc.size(); } -size_t jit_load_emitter::get_inputs_num() const { return 1; } +size_t jit_load_emitter::get_inputs_num() const { + return 1; +} size_t jit_load_emitter::aux_gprs_count() const { // 0 for temp reg for mask load in avx512 if needed - const auto is_pure_load = (src_prc_ == dst_prc_) || - (one_of(src_prc_, ov::element::f32, 
ov::element::i32) && - one_of(dst_prc_, ov::element::f32, ov::element::i32)); - int count = get_aux_regs_as_temp(load_num_, static_cast(src_prc_.size()), is_pure_load, false, threshold_for_mask_emu_load, is_fill_); + const auto is_pure_load = (src_prc_ == dst_prc_) || (one_of(src_prc_, ov::element::f32, ov::element::i32) && + one_of(dst_prc_, ov::element::f32, ov::element::i32)); + int count = get_aux_regs_as_temp(load_num_, + static_cast(src_prc_.size()), + is_pure_load, + false, + threshold_for_mask_emu_load, + is_fill_); // 1 for table address if (is_fill_) @@ -109,7 +134,7 @@ size_t jit_load_emitter::aux_gprs_count() const { return count; } -void jit_load_emitter::emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_load_emitter::emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const { // offset in load emitter is the offset of src gpr register, should be parsed from in_idxs. const int offset = in_idxs.size() == 2 ? in_idxs[1] : 0; if (host_isa_ == cpu::x64::sse41) { @@ -124,7 +149,7 @@ void jit_load_emitter::emit_impl(const std::vector &in_idxs, const std:: } template -void jit_load_emitter::emit_isa(const Xbyak::Reg64 ®_src, const int out_vec_idx, const int offset) const { +void jit_load_emitter::emit_isa(const Xbyak::Reg64& reg_src, const int out_vec_idx, const int offset) const { bool matched_prc = (dst_prc_ == src_prc_) || (dst_prc_ == ov::element::f32) || (dst_prc_ == ov::element::i32); if (!matched_prc) { OV_CPU_JIT_EMITTER_THROW("only support output precision of FP32 or I32 or the same precision as input."); @@ -139,43 +164,43 @@ void jit_load_emitter::emit_isa(const Xbyak::Reg64 ®_src, const int out_vec_i if (src_prc_ == dst_prc_) { load_bytes(Vmm(out_vec_idx), reg_src, offset, load_size_); } else { - // "pure load" + convert. dst_prc must be FP32 or I32. + // "pure load" + convert. dst_prc must be FP32 or I32. 
switch (src_prc_) { - case ov::element::f32: - case ov::element::i32: - load_bytes(Vmm(out_vec_idx), reg_src, offset, load_size_); - break; - case ov::element::i8: - load_bytes_to_dword_extension(Vmm(out_vec_idx), reg_src, offset, true, load_size_); - break; - case ov::element::u8: - load_bytes_to_dword_extension(Vmm(out_vec_idx), reg_src, offset, false, load_size_); - break; - case ov::element::i16: - case ov::element::u16: - case ov::element::bf16: - case ov::element::f16: - load_words_to_dword_extension(Vmm(out_vec_idx), reg_src, offset, src_prc_, load_size_); - break; - default: - OV_CPU_JIT_EMITTER_THROW("has unsupported src precision to load."); + case ov::element::f32: + case ov::element::i32: + load_bytes(Vmm(out_vec_idx), reg_src, offset, load_size_); + break; + case ov::element::i8: + load_bytes_to_dword_extension(Vmm(out_vec_idx), reg_src, offset, true, load_size_); + break; + case ov::element::u8: + load_bytes_to_dword_extension(Vmm(out_vec_idx), reg_src, offset, false, load_size_); + break; + case ov::element::i16: + case ov::element::u16: + case ov::element::bf16: + case ov::element::f16: + load_words_to_dword_extension(Vmm(out_vec_idx), reg_src, offset, src_prc_, load_size_); + break; + default: + OV_CPU_JIT_EMITTER_THROW("has unsupported src precision to load."); } } // post convert between I32 and FP32 if (src_prc_ != dst_prc_) { switch (dst_prc_) { - case ov::element::f32: - if (!src_prc_.is_real()) - h->uni_vcvtdq2ps(Vmm(out_vec_idx), Vmm(out_vec_idx)); - break; - case ov::element::i32: - if (src_prc_.is_real()) { - h->uni_vcvtps2dq(Vmm(out_vec_idx), Vmm(out_vec_idx)); - } - break; - default: - break; + case ov::element::f32: + if (!src_prc_.is_real()) + h->uni_vcvtdq2ps(Vmm(out_vec_idx), Vmm(out_vec_idx)); + break; + case ov::element::i32: + if (src_prc_.is_real()) { + h->uni_vcvtps2dq(Vmm(out_vec_idx), Vmm(out_vec_idx)); + } + break; + default: + break; } } @@ -186,19 +211,19 @@ void jit_load_emitter::emit_isa(const Xbyak::Reg64 ®_src, const 
int out_vec_i } /** -* load_bytes is the utility function to facilitate loading of -* load_size (0 <= load_size <= 64) many contiguous bytes into the Xmm/Ymm/Zmm -* register from the memory referenced by ptr[reg + offset] address. -* -* Functionally, invocation of load_bytes is equivalent to -* the following loop: -* -* for (int idx = 0; idx < load_size; ++idx) -* vpinsrb(vmm, vmm, ptr[reg + offset + idx], idx); -* -*/ + * load_bytes is the utility function to facilitate loading of + * load_size (0 <= load_size <= 64) many contiguous bytes into the Xmm/Ymm/Zmm + * register from the memory referenced by ptr[reg + offset] address. + * + * Functionally, invocation of load_bytes is equivalent to + * the following loop: + * + * for (int idx = 0; idx < load_size; ++idx) + * vpinsrb(vmm, vmm, ptr[reg + offset + idx], idx); + * + */ template -void jit_load_emitter::load_bytes(const Vmm &vmm, const Xbyak::Reg64 ®, int offset, int load_size) const { +void jit_load_emitter::load_bytes(const Vmm& vmm, const Xbyak::Reg64& reg, int offset, int load_size) const { constexpr bool is_xmm = std::is_same::value; constexpr bool is_ymm = std::is_same::value; constexpr bool is_zmm = std::is_same::value; @@ -249,14 +274,17 @@ void jit_load_emitter::load_bytes(const Vmm &vmm, const Xbyak::Reg64 ®, int o } // Cornerstone of partial load is combinaion of vpinsrb/w/d. - // As vpinsrb/w/d will not only write(insert) values into vmm, but also read values in vmm to copy from to positions that not in imm mask, - // this could introduce RAW false dependency(we actually do not care about values not in imm mask). - // To eliminate this false dependency, + // As vpinsrb/w/d will not only write(insert) values into vmm, but also read values in vmm to copy from to + // positions that not in imm mask, this could introduce RAW false dependency(we actually do not care about + // values not in imm mask). To eliminate this false dependency, // 1. 
For 1/2/3/4 bytes tails, replace vpinsrb/w/d with mov,shl etc instructions that don't read vmm. - // Besides eliminate RAW, these instructions have smaller latency, which also bring better perf, especially for small loop iteration case. + // Besides eliminate RAW, these instructions have smaller latency, which also bring better perf, especially + // for small loop iteration case. // 2. For 8/16 bytes, use vmovq/vmovdqu instructions to load, which also don't read src vmm. - // 3. For other size, insert vpxor before vpinsrb/w/d. vpxor and read vmm instructions in previous loop have WAR(write after read) relationship. - // CPU can identify this scenario and assign another physical vector register(register renameing) in next loop to eliminate RAW. + // 3. For other size, insert vpxor before vpinsrb/w/d. vpxor and read vmm instructions in previous loop have + // WAR(write after read) relationship. + // CPU can identify this scenario and assign another physical vector register(register renaming) in next + // loop to eliminate RAW.
if (!one_of(bytes_to_load, 0, 1, 2, 3, 4, 8, 16)) { h->uni_vpxor(vmm, vmm, vmm); } @@ -266,121 +294,136 @@ void jit_load_emitter::load_bytes(const Vmm &vmm, const Xbyak::Reg64 ®, int o h->uni_vmovdqu(xmm, addr(start_bytes)); switch (bytes_to_load) { - case 0: break; - case 1: - h->movzx(Reg32(aux_gpr_idxs[0]), addr(start_bytes)); - h->uni_vmovq(xmm, Reg64(aux_gpr_idxs[0])); - break; - case 2: - h->movzx(Reg32(aux_gpr_idxs[0]), word_addr(start_bytes)); - h->uni_vmovq(xmm, Reg64(aux_gpr_idxs[0])); - break; - case 3: - h->movzx(Reg32(aux_gpr_idxs[0]), addr(start_bytes + 2)); - h->shl(Reg32(aux_gpr_idxs[0]), 16); - h->mov(Reg16(aux_gpr_idxs[0]), word_addr(start_bytes)); - h->uni_vmovq(xmm, Reg64(aux_gpr_idxs[0])); - break; - case 4: h->uni_vmovss(xmm, addr(start_bytes)); break; - case 5: - h->uni_vmovss(xmm, addr(start_bytes)); - h->uni_vpinsrb(xmm, xmm, addr(start_bytes + 4), 4); - break; - case 6: - h->uni_vmovss(xmm, addr(start_bytes)); - h->uni_vpinsrw(xmm, xmm, addr(start_bytes + 4), 2); - break; - case 7: - h->uni_vmovss(xmm, addr(start_bytes)); - h->uni_vpinsrw(xmm, xmm, addr(start_bytes + 4), 2); - h->uni_vpinsrb(xmm, xmm, addr(start_bytes + 6), 6); - break; - case 8: break; - case 9: h->uni_vpinsrb(xmm, xmm, addr(start_bytes + 8), 8); break; - case 10: h->uni_vpinsrw(xmm, xmm, addr(start_bytes + 8), 4); break; - case 11: - h->uni_vpinsrw(xmm, xmm, addr(start_bytes + 8), 4); - h->uni_vpinsrb(xmm, xmm, addr(start_bytes + 10), 10); - break; - case 12: h->uni_vpinsrd(xmm, xmm, addr(start_bytes + 8), 2); break; - case 13: - h->uni_vpinsrd(xmm, xmm, addr(start_bytes + 8), 2); - h->uni_vpinsrb(xmm, xmm, addr(start_bytes + 12), 12); - break; - case 14: - h->uni_vpinsrd(xmm, xmm, addr(start_bytes + 8), 2); - h->uni_vpinsrw(xmm, xmm, addr(start_bytes + 12), 6); - break; - case 15: - h->uni_vpinsrd(xmm, xmm, addr(start_bytes + 8), 2); - h->uni_vpinsrw(xmm, xmm, addr(start_bytes + 12), 6); - h->uni_vpinsrb(xmm, xmm, addr(start_bytes + 14), 14); - break; - case 16: break; 
- default: - OV_CPU_JIT_EMITTER_THROW("has unexpected number of values to load in load_byte."); + case 0: + break; + case 1: + h->movzx(Reg32(aux_gpr_idxs[0]), addr(start_bytes)); + h->uni_vmovq(xmm, Reg64(aux_gpr_idxs[0])); + break; + case 2: + h->movzx(Reg32(aux_gpr_idxs[0]), word_addr(start_bytes)); + h->uni_vmovq(xmm, Reg64(aux_gpr_idxs[0])); + break; + case 3: + h->movzx(Reg32(aux_gpr_idxs[0]), addr(start_bytes + 2)); + h->shl(Reg32(aux_gpr_idxs[0]), 16); + h->mov(Reg16(aux_gpr_idxs[0]), word_addr(start_bytes)); + h->uni_vmovq(xmm, Reg64(aux_gpr_idxs[0])); + break; + case 4: + h->uni_vmovss(xmm, addr(start_bytes)); + break; + case 5: + h->uni_vmovss(xmm, addr(start_bytes)); + h->uni_vpinsrb(xmm, xmm, addr(start_bytes + 4), 4); + break; + case 6: + h->uni_vmovss(xmm, addr(start_bytes)); + h->uni_vpinsrw(xmm, xmm, addr(start_bytes + 4), 2); + break; + case 7: + h->uni_vmovss(xmm, addr(start_bytes)); + h->uni_vpinsrw(xmm, xmm, addr(start_bytes + 4), 2); + h->uni_vpinsrb(xmm, xmm, addr(start_bytes + 6), 6); + break; + case 8: + break; + case 9: + h->uni_vpinsrb(xmm, xmm, addr(start_bytes + 8), 8); + break; + case 10: + h->uni_vpinsrw(xmm, xmm, addr(start_bytes + 8), 4); + break; + case 11: + h->uni_vpinsrw(xmm, xmm, addr(start_bytes + 8), 4); + h->uni_vpinsrb(xmm, xmm, addr(start_bytes + 10), 10); + break; + case 12: + h->uni_vpinsrd(xmm, xmm, addr(start_bytes + 8), 2); + break; + case 13: + h->uni_vpinsrd(xmm, xmm, addr(start_bytes + 8), 2); + h->uni_vpinsrb(xmm, xmm, addr(start_bytes + 12), 12); + break; + case 14: + h->uni_vpinsrd(xmm, xmm, addr(start_bytes + 8), 2); + h->uni_vpinsrw(xmm, xmm, addr(start_bytes + 12), 6); + break; + case 15: + h->uni_vpinsrd(xmm, xmm, addr(start_bytes + 8), 2); + h->uni_vpinsrw(xmm, xmm, addr(start_bytes + 12), 6); + h->uni_vpinsrb(xmm, xmm, addr(start_bytes + 14), 14); + break; + case 16: + break; + default: + OV_CPU_JIT_EMITTER_THROW("has unexpected number of values to load in load_byte."); } if (has_xmm_block) { - 
h->vinsertf128(ymm, ymm, xmm, 1); // insert to upper bits of ymm + h->vinsertf128(ymm, ymm, xmm, 1); // insert to upper bits of ymm if (has_ymm_block) - h->vinsertf128(ymm, ymm, addr(32), 0); // insert to lower bits of ymm + h->vinsertf128(ymm, ymm, addr(32), 0); // insert to lower bits of ymm else - h->vinsertf128(ymm, ymm, addr(0), 0); // insert to lower bits of ymm + h->vinsertf128(ymm, ymm, addr(0), 0); // insert to lower bits of ymm } if (has_ymm_block) { - h->vinsertf64x4(zmm, zmm, ymm, 1); // insert to upper bits of zmm - h->vinsertf64x4(zmm, zmm, addr(0), 0); // insert to lower bits of zmm + h->vinsertf64x4(zmm, zmm, ymm, 1); // insert to upper bits of zmm + h->vinsertf64x4(zmm, zmm, addr(0), 0); // insert to lower bits of zmm } }; switch (load_size) { - case 64: - h->uni_vmovdqu(zmm, addr(0)); - break; - case 32: - h->uni_vmovdqu(ymm, addr(0)); - break; - case 16: - h->uni_vmovdqu(xmm, addr(0)); - break; - default: { - if (mayiuse(cpu::x64::avx512_core) && load_size > threshold_for_mask_emu_load) { - uint64_t mask = 1; - mask = (mask << load_size) - mask; - h->mov(Reg64(aux_gpr_idxs[0]), mask); - h->kmovq(k_mask, Reg64(aux_gpr_idxs[0])); - h->vmovdqu8(zmm | k_mask | T_z, addr(0)); - } else { - load_byte_base(); - } - break; + case 64: + h->uni_vmovdqu(zmm, addr(0)); + break; + case 32: + h->uni_vmovdqu(ymm, addr(0)); + break; + case 16: + h->uni_vmovdqu(xmm, addr(0)); + break; + default: { + if (mayiuse(cpu::x64::avx512_core) && load_size > threshold_for_mask_emu_load) { + uint64_t mask = 1; + mask = (mask << load_size) - mask; + h->mov(Reg64(aux_gpr_idxs[0]), mask); + h->kmovq(k_mask, Reg64(aux_gpr_idxs[0])); + h->vmovdqu8(zmm | k_mask | T_z, addr(0)); + } else { + load_byte_base(); } + break; + } } } /** -* load_bytes_to_dword_extension is the utility function to facilitate -* loading of load_size (0 <= load_size <= 16) many contiguous bytes in -* the xmm register from the memory referenced by ptr[reg + offset] -* address and then do signed/zero 
extension of those to double words. -* -* Functionally, invocation of load_bytes_to_dword_extension is equivalent -* to the following: -* -* for (int idx = 0; idx < load_size; ++idx) -* vpinsrb(vmm, vmm, ptr[reg + offset + idx], idx); -* if (is_signed) vpmovsxbd(vmm, vmm); else vpmovzxbd(vmm, vmm); -* -* Valid values for the load_size variable are: -* [0..4] for XMM version of the function, i.e. 4 bytes -> 4 * 32 bit == 128 bit -* [0..8] for YMM version of the function. i.e. 8 bytes -> 8 * 32 bit == 256 bit -* [0..16] for ZMM version of the function. i.e. 16 bytes -> 16 * 32 bit == 512 bit -*/ + * load_bytes_to_dword_extension is the utility function to facilitate + * loading of load_size (0 <= load_size <= 16) many contiguous bytes in + * the xmm register from the memory referenced by ptr[reg + offset] + * address and then do signed/zero extension of those to double words. + * + * Functionally, invocation of load_bytes_to_dword_extension is equivalent + * to the following: + * + * for (int idx = 0; idx < load_size; ++idx) + * vpinsrb(vmm, vmm, ptr[reg + offset + idx], idx); + * if (is_signed) vpmovsxbd(vmm, vmm); else vpmovzxbd(vmm, vmm); + * + * Valid values for the load_size variable are: + * [0..4] for XMM version of the function, i.e. 4 bytes -> 4 * 32 bit == 128 bit + * [0..8] for YMM version of the function. i.e. 8 bytes -> 8 * 32 bit == 256 bit + * [0..16] for ZMM version of the function. i.e. 
16 bytes -> 16 * 32 bit == 512 bit + */ template -void jit_load_emitter::load_bytes_to_dword_extension(const Vmm &vmm, const Xbyak::Reg64 ®, int offset, bool is_signed, int load_size) const { +void jit_load_emitter::load_bytes_to_dword_extension(const Vmm& vmm, + const Xbyak::Reg64& reg, + int offset, + bool is_signed, + int load_size) const { constexpr bool is_xmm = std::is_same::value; constexpr bool is_ymm = std::is_same::value; constexpr bool is_zmm = std::is_same::value; @@ -401,76 +444,80 @@ void jit_load_emitter::load_bytes_to_dword_extension(const Vmm &vmm, const Xbyak // For load_size == 4/8/16, do load/extension in one go switch (load_size) { - case 16: { - // full size of zmm - const auto zmm = Xbyak::Zmm(vmm.getIdx()); - if (is_signed) - h->uni_vpmovsxbd(zmm, ptr[reg + offset]); - else - h->uni_vpmovzxbd(zmm, ptr[reg + offset]); - break; - } - case 8: { - // full size of ymm or ymm_block of zmm - const auto ymm = Xbyak::Ymm(vmm.getIdx()); + case 16: { + // full size of zmm + const auto zmm = Xbyak::Zmm(vmm.getIdx()); + if (is_signed) + h->uni_vpmovsxbd(zmm, ptr[reg + offset]); + else + h->uni_vpmovzxbd(zmm, ptr[reg + offset]); + break; + } + case 8: { + // full size of ymm or ymm_block of zmm + const auto ymm = Xbyak::Ymm(vmm.getIdx()); + if (is_signed) + h->uni_vpmovsxbd(ymm, ptr[reg + offset]); + else + h->uni_vpmovzxbd(ymm, ptr[reg + offset]); + break; + } + case 4: { + // full size of xmm or xmm_block of ymm/zmm + const auto xmm = Xbyak::Xmm(vmm.getIdx()); + if (is_signed) + h->uni_vpmovsxbd(xmm, ptr[reg + offset]); + else + h->uni_vpmovzxbd(xmm, ptr[reg + offset]); + break; + } + default: { + if (is_zmm && load_size > threshold_for_mask_emu_load) { + unsigned int mask = 1; + mask = (mask << load_size) - mask; + h->mov(Reg32(aux_gpr_idxs[0]), mask); + h->kmovw(k_mask, Reg32(aux_gpr_idxs[0])); if (is_signed) - h->uni_vpmovsxbd(ymm, ptr[reg + offset]); + h->uni_vpmovsxbd(vmm | k_mask | T_z, ptr[reg + offset]); else - h->uni_vpmovzxbd(ymm, ptr[reg + 
offset]); - break; - } - case 4: { - // full size of xmm or xmm_block of ymm/zmm + h->uni_vpmovzxbd(vmm | k_mask | T_z, ptr[reg + offset]); + } else { const auto xmm = Xbyak::Xmm(vmm.getIdx()); + load_bytes(xmm, reg, offset, load_size); if (is_signed) - h->uni_vpmovsxbd(xmm, ptr[reg + offset]); + h->uni_vpmovsxbd(vmm, xmm); else - h->uni_vpmovzxbd(xmm, ptr[reg + offset]); - break; - } - default: { - if (is_zmm && load_size > threshold_for_mask_emu_load) { - unsigned int mask = 1; - mask = (mask << load_size) - mask; - h->mov(Reg32(aux_gpr_idxs[0]), mask); - h->kmovw(k_mask, Reg32(aux_gpr_idxs[0])); - if (is_signed) - h->uni_vpmovsxbd(vmm | k_mask | T_z, ptr[reg + offset]); - else - h->uni_vpmovzxbd(vmm | k_mask | T_z, ptr[reg + offset]); - } else { - const auto xmm = Xbyak::Xmm(vmm.getIdx()); - load_bytes(xmm, reg, offset, load_size); - if (is_signed) - h->uni_vpmovsxbd(vmm, xmm); - else - h->uni_vpmovzxbd(vmm, xmm); - } - break; + h->uni_vpmovzxbd(vmm, xmm); } + break; + } } } /** -* load_words_to_dword_extension is the utility function to facilitate -* loading of load_size (0 <= load_size <= 32) byte many contiguous words(num == load_size / 2) -* in the Vmm register from the memory referenced by ptr[reg + offset] -* address and then do signed/zero extension of those to double words. -* -* Functionally, invocation of load_words_to_dword_extension is equivalent -* to the following extended pseudo code: -* -* for (int idx = 0; idx < load_size / 2; ++idx) -* vpinsrw(vmm, vmm, ptr[reg + offset + 2 * idx], idx); -* if (is_signed) vpmovsxwd(vmm, vmm); else vpmovzxwd(vmm, vmm); -* -* Valid values for the load_size variable are: -* [0..8] for XMM version of the function. i.e. 4 words -> 4 * 32 bit == 128 bit -* [0..16] for YMM version of the function. i.e. 8 words -> 8 * 32 bit == 256 bit -* [0.. 32] for ZMM version of the function. i.e. 
16 words -> 16 * 32 bit == 512 bit -*/ + * load_words_to_dword_extension is the utility function to facilitate + * loading of load_size (0 <= load_size <= 32) byte many contiguous words(num == load_size / 2) + * in the Vmm register from the memory referenced by ptr[reg + offset] + * address and then do signed/zero extension of those to double words. + * + * Functionally, invocation of load_words_to_dword_extension is equivalent + * to the following extended pseudo code: + * + * for (int idx = 0; idx < load_size / 2; ++idx) + * vpinsrw(vmm, vmm, ptr[reg + offset + 2 * idx], idx); + * if (is_signed) vpmovsxwd(vmm, vmm); else vpmovzxwd(vmm, vmm); + * + * Valid values for the load_size variable are: + * [0..8] for XMM version of the function. i.e. 4 words -> 4 * 32 bit == 128 bit + * [0..16] for YMM version of the function. i.e. 8 words -> 8 * 32 bit == 256 bit + * [0.. 32] for ZMM version of the function. i.e. 16 words -> 16 * 32 bit == 512 bit + */ template -void jit_load_emitter::load_words_to_dword_extension(const Vmm &vmm, const Xbyak::Reg64 ®, int offset, ov::element::Type prc, int load_size) const { +void jit_load_emitter::load_words_to_dword_extension(const Vmm& vmm, + const Xbyak::Reg64& reg, + int offset, + ov::element::Type prc, + int load_size) const { constexpr bool is_xmm = std::is_same::value; constexpr bool is_ymm = std::is_same::value; constexpr bool is_zmm = std::is_same::value; @@ -503,87 +550,87 @@ void jit_load_emitter::load_words_to_dword_extension(const Vmm &vmm, const Xbyak // For load_size == 32/16/8, do load/extension in one go // including xmm/ymm tail block for ymm/zmm, so explicite xmm/ymm/zmm switch (load_size) { - case 32: { - if (is_bf16) { + case 32: { + if (is_bf16) { + h->uni_vpmovzxwd(zmm, ptr[reg + offset]); + h->uni_vpslld(zmm, zmm, 16); + } else if (is_f16) { + h->vcvtph2ps(zmm, ptr[reg + offset]); + } else { + if (is_signed) + h->uni_vpmovsxwd(zmm, ptr[reg + offset]); + else h->uni_vpmovzxwd(zmm, ptr[reg + offset]); - 
h->uni_vpslld(zmm, zmm, 16); - } else if (is_f16) { - h->vcvtph2ps(zmm, ptr[reg + offset]); - } else { - if (is_signed) - h->uni_vpmovsxwd(zmm, ptr[reg + offset]); - else - h->uni_vpmovzxwd(zmm, ptr[reg + offset]); - } - break; } - case 16: { - if (is_bf16) { + break; + } + case 16: { + if (is_bf16) { + h->uni_vpmovzxwd(ymm, ptr[reg + offset]); + h->uni_vpslld(ymm, ymm, 16); + } else if (is_f16) { + h->vcvtph2ps(ymm, ptr[reg + offset]); + } else { + if (is_signed) + h->uni_vpmovsxwd(ymm, ptr[reg + offset]); + else h->uni_vpmovzxwd(ymm, ptr[reg + offset]); - h->uni_vpslld(ymm, ymm, 16); + } + break; + } + case 8: { + if (is_bf16) { + h->uni_vpmovzxwd(xmm, ptr[reg + offset]); + h->uni_vpslld(xmm, xmm, 16); + } else if (is_f16) { + h->vcvtph2ps(xmm, ptr[reg + offset]); + } else { + if (is_signed) + h->uni_vpmovsxwd(xmm, ptr[reg + offset]); + else + h->uni_vpmovzxwd(xmm, ptr[reg + offset]); + } + break; + } + default: { + if (is_zmm && load_size > threshold_for_mask_emu_load) { + unsigned int mask = 1; + mask = (mask << (load_size / 2)) - mask; + h->mov(Reg32(aux_gpr_idxs[0]), mask); + h->kmovw(k_mask, Reg32(aux_gpr_idxs[0])); + if (is_bf16) { + h->uni_vpmovzxwd(vmm | k_mask | T_z, ptr[reg + offset]); + h->uni_vpslld(vmm, vmm, 16); } else if (is_f16) { - h->vcvtph2ps(ymm, ptr[reg + offset]); + h->vcvtph2ps(vmm | k_mask | T_z, ptr[reg + offset]); } else { if (is_signed) - h->uni_vpmovsxwd(ymm, ptr[reg + offset]); + h->uni_vpmovsxwd(vmm | k_mask | T_z, ptr[reg + offset]); else - h->uni_vpmovzxwd(ymm, ptr[reg + offset]); + h->uni_vpmovzxwd(vmm | k_mask | T_z, ptr[reg + offset]); } - break; - } - case 8: { + } else { + // xmm or ymm version + load_bytes(xmm, reg, offset, load_size); if (is_bf16) { - h->uni_vpmovzxwd(xmm, ptr[reg + offset]); - h->uni_vpslld(xmm, xmm, 16); + h->uni_vpmovzxwd(vmm, xmm); + h->uni_vpslld(vmm, vmm, 16); } else if (is_f16) { - h->vcvtph2ps(xmm, ptr[reg + offset]); + h->vcvtph2ps(ymm, xmm); } else { if (is_signed) - h->uni_vpmovsxwd(xmm, ptr[reg + 
offset]); + h->uni_vpmovsxwd(vmm, xmm); else - h->uni_vpmovzxwd(xmm, ptr[reg + offset]); - } - break; - } - default: { - if (is_zmm && load_size > threshold_for_mask_emu_load) { - unsigned int mask = 1; - mask = (mask << (load_size / 2)) - mask; - h->mov(Reg32(aux_gpr_idxs[0]), mask); - h->kmovw(k_mask, Reg32(aux_gpr_idxs[0])); - if (is_bf16) { - h->uni_vpmovzxwd(vmm | k_mask | T_z, ptr[reg + offset]); - h->uni_vpslld(vmm, vmm, 16); - } else if (is_f16) { - h->vcvtph2ps(vmm | k_mask | T_z, ptr[reg + offset]); - } else { - if (is_signed) - h->uni_vpmovsxwd(vmm | k_mask | T_z, ptr[reg + offset]); - else - h->uni_vpmovzxwd(vmm | k_mask | T_z, ptr[reg + offset]); - } - } else { - // xmm or ymm version - load_bytes(xmm, reg, offset, load_size); - if (is_bf16) { h->uni_vpmovzxwd(vmm, xmm); - h->uni_vpslld(vmm, vmm, 16); - } else if (is_f16) { - h->vcvtph2ps(ymm, xmm); - } else { - if (is_signed) - h->uni_vpmovsxwd(vmm, xmm); - else - h->uni_vpmovzxwd(vmm, xmm); - } } - break; } + break; + } } } template -void jit_load_emitter::fill_with_default(const Vmm &vmm, std::string fill_value, const int &load_num) const { +void jit_load_emitter::fill_with_default(const Vmm& vmm, std::string fill_value, const int& load_num) const { constexpr bool is_xmm = std::is_same::value; constexpr bool is_ymm = std::is_same::value; constexpr bool is_zmm = std::is_same::value; @@ -614,10 +661,20 @@ void jit_load_emitter::register_table_entries() { } /// STORE /// -jit_store_emitter::jit_store_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - ov::element::Type src_prc, ov::element::Type dst_prc, int store_num, arithmetic_mode mode, ov::element::Type exec_prc, +jit_store_emitter::jit_store_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + ov::element::Type src_prc, + ov::element::Type dst_prc, + int store_num, + arithmetic_mode mode, + ov::element::Type exec_prc, emitter_in_out_map in_out_type) - : 
jit_emitter(host, host_isa, exec_prc, in_out_type), name_("unknown"), store_num_(store_num), src_prc_(src_prc), dst_prc_(dst_prc), mode_(mode) { + : jit_emitter(host, host_isa, exec_prc, in_out_type), + name_("unknown"), + store_num_(store_num), + src_prc_(src_prc), + dst_prc_(dst_prc), + mode_(mode) { prepare_table(); v_len_elt_ = get_vec_length() / exec_prc.size(); store_size_ = store_num * dst_prc.size(); @@ -630,17 +687,20 @@ inline bool jit_store_emitter::is_saturation() const { // case for SSE and AVX2 when we should use AND to truncate values inline bool jit_store_emitter::is_truncation_emulation() const { - return !mayiuse(cpu::x64::avx512_core) && !is_saturation() && - src_prc_ != dst_prc_ && one_of(dst_prc_, ov::element::u16, ov::element::i16, ov::element::u8, ov::element::i8); + return !mayiuse(cpu::x64::avx512_core) && !is_saturation() && src_prc_ != dst_prc_ && + one_of(dst_prc_, ov::element::u16, ov::element::i16, ov::element::u8, ov::element::i8); } size_t jit_store_emitter::aux_gprs_count() const { // for temp reg for store(mask version or special number cases) - const auto is_pure_store = (src_prc_ == dst_prc_) || - (one_of(src_prc_, ov::element::f32, ov::element::i32) && - one_of(dst_prc_, ov::element::f32, ov::element::i32)); + const auto is_pure_store = (src_prc_ == dst_prc_) || (one_of(src_prc_, ov::element::f32, ov::element::i32) && + one_of(dst_prc_, ov::element::f32, ov::element::i32)); const auto is_store_as_real16 = one_of(dst_prc_, ov::element::bf16, ov::element::f16); - int count = get_aux_regs_as_temp(store_num_, static_cast(dst_prc_.size()), is_pure_store, is_store_as_real16, threshold_for_mask_emu_store); + int count = get_aux_regs_as_temp(store_num_, + static_cast(dst_prc_.size()), + is_pure_store, + is_store_as_real16, + threshold_for_mask_emu_store); // for table value in truncation arithmetic mode if (is_truncation_emulation()) @@ -661,14 +721,17 @@ size_t jit_store_emitter::aux_vecs_count() const { if ((host_isa_ == 
cpu::x64::sse41) && (src_prc_ == ov::element::f32 && dst_prc_ == ov::element::bf16)) count++; - // zero value, zeroed and passed from caller from performance standpoint(zeroed one time and not need preserve and restore status) + // zero value, zeroed and passed from caller from performance standpoint(zeroed one time and not need preserve and + // restore status) if (mayiuse(cpu::x64::avx512_core) && one_of(dst_prc_, ov::element::u8, ov::element::u16)) count++; return count; } -size_t jit_store_emitter::get_inputs_num() const { return 1; } +size_t jit_store_emitter::get_inputs_num() const { + return 1; +} void jit_store_emitter::emit_data() const { jit_emitter::emit_data(); @@ -676,7 +739,7 @@ void jit_store_emitter::emit_data() const { uni_vcvtneps2bf16_->emit_data(); } -void jit_store_emitter::emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_store_emitter::emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const { // offset in store emitter is the offset of dst gpr register, should be parsed from out_idxs. const int offset = out_idxs.size() == 2 ? 
out_idxs[1] : 0; if (host_isa_ == cpu::x64::sse41) { @@ -691,7 +754,7 @@ void jit_store_emitter::emit_impl(const std::vector &in_idxs, const std: } template -void jit_store_emitter::emit_isa(const int in_vec_idx, const Xbyak::Reg64 ®_dst, const int offset) const { +void jit_store_emitter::emit_isa(const int in_vec_idx, const Xbyak::Reg64& reg_dst, const int offset) const { bool matched_prc = (src_prc_ == dst_prc_) || (src_prc_ == ov::element::f32) || (src_prc_ == ov::element::i32); if (!matched_prc) { OV_CPU_JIT_EMITTER_THROW("only support input precision of FP32 or I32 or the same precision as output."); @@ -707,29 +770,29 @@ void jit_store_emitter::emit_isa(const int in_vec_idx, const Xbyak::Reg64 ®_d data_idx = in_vec_idx; data_reg_updated = false; if (!aux_vec_idxs.empty()) - aux_src_idx = aux_vec_idxs.back(); // to avoid src pollution + aux_src_idx = aux_vec_idxs.back(); // to avoid src pollution if (src_prc_ != dst_prc_) { switch (src_prc_) { - case ov::element::f32: - if (!dst_prc_.is_real()) { - if (is_saturation()) { - h->uni_vcvtps2dq(Vmm(aux_src_idx), Vmm(data_idx)); - } else { - h->uni_vcvttps2dq(Vmm(aux_src_idx), Vmm(data_idx)); - } - data_idx = aux_src_idx; - data_reg_updated = true; - } - break; - case ov::element::i32: - if (dst_prc_.is_real()) { - h->uni_vcvtdq2ps(Vmm(aux_src_idx), Vmm(data_idx)); - data_idx = aux_src_idx; - data_reg_updated = true; + case ov::element::f32: + if (!dst_prc_.is_real()) { + if (is_saturation()) { + h->uni_vcvtps2dq(Vmm(aux_src_idx), Vmm(data_idx)); + } else { + h->uni_vcvttps2dq(Vmm(aux_src_idx), Vmm(data_idx)); } - break; - default: - break; + data_idx = aux_src_idx; + data_reg_updated = true; + } + break; + case ov::element::i32: + if (dst_prc_.is_real()) { + h->uni_vcvtdq2ps(Vmm(aux_src_idx), Vmm(data_idx)); + data_idx = aux_src_idx; + data_reg_updated = true; + } + break; + default: + break; } } @@ -737,44 +800,44 @@ void jit_store_emitter::emit_isa(const int in_vec_idx, const Xbyak::Reg64 ®_d store_bytes(reg_dst, 
offset, store_size_); } else { switch (dst_prc_) { - case ov::element::f32: - case ov::element::i32: - store_bytes(reg_dst, offset, store_size_); - break; - case ov::element::i8: - store_dword_to_byte_extension(reg_dst, offset, true, store_num_); - break; - case ov::element::u8: - store_dword_to_byte_extension(reg_dst, offset, false, store_num_); - break; - case ov::element::i16: - case ov::element::u16: - case ov::element::bf16: - case ov::element::f16: - store_dword_to_word_extension(reg_dst, offset, dst_prc_, store_num_); - break; - default: - OV_CPU_JIT_EMITTER_THROW("has unsupported dst precision to store."); + case ov::element::f32: + case ov::element::i32: + store_bytes(reg_dst, offset, store_size_); + break; + case ov::element::i8: + store_dword_to_byte_extension(reg_dst, offset, true, store_num_); + break; + case ov::element::u8: + store_dword_to_byte_extension(reg_dst, offset, false, store_num_); + break; + case ov::element::i16: + case ov::element::u16: + case ov::element::bf16: + case ov::element::f16: + store_dword_to_word_extension(reg_dst, offset, dst_prc_, store_num_); + break; + default: + OV_CPU_JIT_EMITTER_THROW("has unsupported dst precision to store."); } } } /** -* store_bytes is the utility function to facilitate storing of -* store_size (0 <= store_size <= 64) many contiguous bytes from the Xmm/Ymm/Zmm -* register into the memory referenced by ptr[reg + offset] address. -* -* Additionally, when store_size > 16, the input Ymm register will not be -* preserved due to the usage of vextracti128 instruction. -* -* Functionally, invocation of store_bytes is equivalent -* to the following loop: -* -* for (int idx = 0; idx < store_size; ++idx) -* vpextrb(ptr[reg + offset + idx], vmm, idx); -* -*/ + * store_bytes is the utility function to facilitate storing of + * store_size (0 <= store_size <= 64) many contiguous bytes from the Xmm/Ymm/Zmm + * register into the memory referenced by ptr[reg + offset] address. 
+ * + * Additionally, when store_size > 16, the input Ymm register will not be + * preserved due to the usage of vextracti128 instruction. + * + * Functionally, invocation of store_bytes is equivalent + * to the following loop: + * + * for (int idx = 0; idx < store_size; ++idx) + * vpextrb(ptr[reg + offset + idx], vmm, idx); + * + */ template -void jit_store_emitter::store_bytes(const Xbyak::Reg64 ®, int offset, int store_size) const { +void jit_store_emitter::store_bytes(const Xbyak::Reg64& reg, int offset, int store_size) const { constexpr bool is_xmm = std::is_same::value; constexpr bool is_ymm = std::is_same::value; constexpr bool is_zmm = std::is_same::value; @@ -805,7 +868,7 @@ void jit_store_emitter::store_bytes(const Xbyak::Reg64 ®, int offset, int sto int bytes_to_store = store_size; if (store_size > 32) { - h->uni_vmovdqu(addr(0), ymm); // store lower bits from zmm + h->uni_vmovdqu(addr(0), ymm); // store lower bits from zmm start_bytes += 32; bytes_to_store -= 32; // load upper bits from zmm into ymm @@ -813,7 +876,7 @@ void jit_store_emitter::store_bytes(const Xbyak::Reg64 ®, int offset, int sto } if (bytes_to_store > 16) { - h->uni_vmovdqu(addr(start_bytes), xmm); // store lower bits from ymm + h->uni_vmovdqu(addr(start_bytes), xmm); // store lower bits from ymm start_bytes += 16; bytes_to_store -= 16; // load upper bits from ymm into xmm @@ -834,93 +897,108 @@ void jit_store_emitter::store_bytes(const Xbyak::Reg64 ®, int offset, int sto h->mov(addr(start_bytes + bytes_offset), Reg8(gpr_idx, ext8bit)); }; switch (bytes_to_store) { - case 0: break; - case 1: - h->uni_vmovq(Reg64(aux_gpr_idxs[0]), xmm); - store_one_byte(0, aux_gpr_idxs[0]); - break; - case 2: - h->uni_vmovq(Reg64(aux_gpr_idxs[0]), xmm); - h->mov(addr(start_bytes), Reg16(aux_gpr_idxs[0])); - break; - case 3: - h->uni_vmovq(Reg64(aux_gpr_idxs[0]), xmm); - h->mov(addr(start_bytes), Reg16(aux_gpr_idxs[0])); - h->shr(Reg64(aux_gpr_idxs[0]), 16); - store_one_byte(2, aux_gpr_idxs[0]); - break; 
- case 4: h->uni_vmovss(addr(start_bytes), xmm); break; - case 5: - h->uni_vmovss(addr(start_bytes), xmm); - h->uni_vpextrb(addr(start_bytes + 4), xmm, 4); - break; - case 6: - h->uni_vmovss(addr(start_bytes), xmm); - h->uni_vpextrw(addr(start_bytes + 4), xmm, 2); - break; - case 7: - h->uni_vmovss(addr(start_bytes), xmm); - h->uni_vpextrw(addr(start_bytes + 4), xmm, 2); - h->uni_vpextrb(addr(start_bytes + 6), xmm, 6); - break; - case 8: break; - case 9: h->uni_vpextrb(addr(start_bytes + 8), xmm, 8); break; - case 10: h->uni_vpextrw(addr(start_bytes + 8), xmm, 4); break; - case 11: - h->uni_vpextrw(addr(start_bytes + 8), xmm, 4); - h->uni_vpextrb(addr(start_bytes + 10), xmm, 10); - break; - case 12: h->uni_vpextrd(addr(start_bytes + 8), xmm, 2); break; - case 13: - h->uni_vpextrd(addr(start_bytes + 8), xmm, 2); - h->uni_vpextrb(addr(start_bytes + 12), xmm, 12); - break; - case 14: - h->uni_vpextrd(addr(start_bytes + 8), xmm, 2); - h->uni_vpextrw(addr(start_bytes + 12), xmm, 6); - break; - case 15: - h->uni_vpextrd(addr(start_bytes + 8), xmm, 2); - h->uni_vpextrw(addr(start_bytes + 12), xmm, 6); - h->uni_vpextrb(addr(start_bytes + 14), xmm, 14); - break; - case 16: break; - default: - OV_CPU_JIT_EMITTER_THROW("has unexpected number of values to store in store_bytes."); - } - }; - - switch (store_size) { - case 64: - h->uni_vmovdqu(addr(0), zmm); + case 0: + break; + case 1: + h->uni_vmovq(Reg64(aux_gpr_idxs[0]), xmm); + store_one_byte(0, aux_gpr_idxs[0]); + break; + case 2: + h->uni_vmovq(Reg64(aux_gpr_idxs[0]), xmm); + h->mov(addr(start_bytes), Reg16(aux_gpr_idxs[0])); + break; + case 3: + h->uni_vmovq(Reg64(aux_gpr_idxs[0]), xmm); + h->mov(addr(start_bytes), Reg16(aux_gpr_idxs[0])); + h->shr(Reg64(aux_gpr_idxs[0]), 16); + store_one_byte(2, aux_gpr_idxs[0]); + break; + case 4: + h->uni_vmovss(addr(start_bytes), xmm); + break; + case 5: + h->uni_vmovss(addr(start_bytes), xmm); + h->uni_vpextrb(addr(start_bytes + 4), xmm, 4); + break; + case 6: + 
h->uni_vmovss(addr(start_bytes), xmm); + h->uni_vpextrw(addr(start_bytes + 4), xmm, 2); + break; + case 7: + h->uni_vmovss(addr(start_bytes), xmm); + h->uni_vpextrw(addr(start_bytes + 4), xmm, 2); + h->uni_vpextrb(addr(start_bytes + 6), xmm, 6); + break; + case 8: break; - case 32: - h->uni_vmovdqu(addr(0), ymm); + case 9: + h->uni_vpextrb(addr(start_bytes + 8), xmm, 8); + break; + case 10: + h->uni_vpextrw(addr(start_bytes + 8), xmm, 4); + break; + case 11: + h->uni_vpextrw(addr(start_bytes + 8), xmm, 4); + h->uni_vpextrb(addr(start_bytes + 10), xmm, 10); + break; + case 12: + h->uni_vpextrd(addr(start_bytes + 8), xmm, 2); + break; + case 13: + h->uni_vpextrd(addr(start_bytes + 8), xmm, 2); + h->uni_vpextrb(addr(start_bytes + 12), xmm, 12); + break; + case 14: + h->uni_vpextrd(addr(start_bytes + 8), xmm, 2); + h->uni_vpextrw(addr(start_bytes + 12), xmm, 6); + break; + case 15: + h->uni_vpextrd(addr(start_bytes + 8), xmm, 2); + h->uni_vpextrw(addr(start_bytes + 12), xmm, 6); + h->uni_vpextrb(addr(start_bytes + 14), xmm, 14); break; case 16: - h->uni_vmovdqu(addr(0), xmm); break; default: - if (mayiuse(cpu::x64::avx512_core) && store_size > threshold_for_mask_emu_store) { - uint64_t mask = 1; - mask = (mask << store_size) - mask; - h->mov(Reg64(aux_gpr_idxs[0]), mask); - h->kmovq(k_mask, Reg64(aux_gpr_idxs[0])); - h->vmovdqu8(addr(0), zmm | k_mask); - } else { - store_byte_base(); - } - break; + OV_CPU_JIT_EMITTER_THROW("has unexpected number of values to store in store_bytes."); + } + }; + + switch (store_size) { + case 64: + h->uni_vmovdqu(addr(0), zmm); + break; + case 32: + h->uni_vmovdqu(addr(0), ymm); + break; + case 16: + h->uni_vmovdqu(addr(0), xmm); + break; + default: + if (mayiuse(cpu::x64::avx512_core) && store_size > threshold_for_mask_emu_store) { + uint64_t mask = 1; + mask = (mask << store_size) - mask; + h->mov(Reg64(aux_gpr_idxs[0]), mask); + h->kmovq(k_mask, Reg64(aux_gpr_idxs[0])); + h->vmovdqu8(addr(0), zmm | k_mask); + } else { + 
store_byte_base(); + } + break; } } /** -* store_dword_to_byte_extension is the utility function to -* 1. convert store_num (0 <= store_num <= 16) dwords in the Xmm/Ymm/Zmm to store_num bytes, singed or unsinged, truncated or saturated. -* 2. store the packed byte into the memory referenced by ptr[reg + offset] address. -*/ + * store_dword_to_byte_extension is the utility function to + * 1. convert store_num (0 <= store_num <= 16) dwords in the Xmm/Ymm/Zmm to store_num bytes, singed or unsinged, + * truncated or saturated. + * 2. store the packed byte into the memory referenced by ptr[reg + offset] address. + */ template -void jit_store_emitter::store_dword_to_byte_extension(const Xbyak::Reg64 ®, int offset, bool is_signed, int store_num) const { +void jit_store_emitter::store_dword_to_byte_extension(const Xbyak::Reg64& reg, + int offset, + bool is_signed, + int store_num) const { constexpr bool is_xmm = std::is_same::value; constexpr bool is_ymm = std::is_same::value; constexpr bool is_zmm = std::is_same::value; @@ -1032,7 +1110,7 @@ void jit_store_emitter::store_dword_to_byte_extension(const Xbyak::Reg64 ®, i break; case 4: if (mayiuse(cpu::x64::avx512_core)) { - if (is_saturation()) { // xmm block on avx512F + VL + if (is_saturation()) { // xmm block on avx512F + VL if (is_signed) { h->vpmovsdb(addr(0), xmm); } else { @@ -1074,13 +1152,16 @@ void jit_store_emitter::store_dword_to_byte_extension(const Xbyak::Reg64 ®, i } /** -* store_dword_to_word_extension is the utility function to -* 1. convert store_num (0 <= store_num <= 16) dwords in the Xmm/Ymm/Zmm to store_num words with singed or unsinged saturation. -* 2. store the packed words into the memory referenced by ptr[reg + offset] address. -*/ + * store_dword_to_word_extension is the utility function to + * 1. convert store_num (0 <= store_num <= 16) dwords in the Xmm/Ymm/Zmm to store_num words with singed or unsinged + * saturation. + * 2. 
store the packed words into the memory referenced by ptr[reg + offset] address. + */ template -void jit_store_emitter::store_dword_to_word_extension(const Xbyak::Reg64 ®, - int offset, ov::element::Type precision, int store_num) const { +void jit_store_emitter::store_dword_to_word_extension(const Xbyak::Reg64& reg, + int offset, + ov::element::Type precision, + int store_num) const { const bool is_bf16 = (precision == ov::element::bf16); const bool is_f16 = (precision == ov::element::f16); const bool is_signed = precision.is_signed(); @@ -1151,7 +1232,8 @@ void jit_store_emitter::store_dword_to_word_extension(const Xbyak::Reg64 ®, if (is_bf16) { if (mayiuse(cpu::x64::avx512_core)) { - // to avoid src vmm pollution, this check means no precision convert happens, so data_idx is still original_data_idx. + // to avoid src vmm pollution, this check means no precision convert happens, so data_idx is still + // original_data_idx. if (src_prc_ == ov::element::f32) { ymm = Ymm(aux_vec_idxs[0]); } @@ -1171,7 +1253,8 @@ void jit_store_emitter::store_dword_to_word_extension(const Xbyak::Reg64 ®, if (host_isa_ == cpu::x64::sse41 && src_prc_ == ov::element::f32) { auto xmm_aux1 = Xmm(aux_vec_idxs[1]); h->uni_vmovups(xmm_aux1, vmm); - uni_vcvtneps2bf16_->emit_code({static_cast(vmm.getIdx())}, {static_cast(vmm.getIdx())}, + uni_vcvtneps2bf16_->emit_code({static_cast(vmm.getIdx())}, + {static_cast(vmm.getIdx())}, {static_cast(xmm.getIdx())}); h->uni_vmovups(xmm, vmm); h->uni_vmovups(vmm, xmm_aux1); // return original data to src vmm @@ -1222,7 +1305,7 @@ void jit_store_emitter::store_dword_to_word_extension(const Xbyak::Reg64 ®, Vmm zero(aux_vec_idxs[0]); h->uni_vpxor(zero, zero, zero); STORE_KEEP_SOURCE(uni_vpmaxsd, vmm, Vmm(aux_src_idx), vmm, zero); - h->vpmovusdw(ptr[reg + offset], vmm); // unsinged int32 saturate to unsigned int16. + h->vpmovusdw(ptr[reg + offset], vmm); // unsinged int32 saturate to unsigned int16. 
} } else { h->vpmovdw(ptr[reg + offset], vmm); @@ -1261,7 +1344,7 @@ void jit_store_emitter::store_dword_to_word_extension(const Xbyak::Reg64 ®, h->vpmovdw(ptr[reg + offset], xmm); } } else { - store_dword_to_word_base(); + store_dword_to_word_base(); } break; default: @@ -1297,5 +1380,5 @@ void jit_store_emitter::register_table_entries() { } } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.hpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.hpp index 9570a836aa64ee..2c4e15ccaeb28b 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_load_store_emitters.hpp @@ -4,16 +4,23 @@ #pragma once -#include "jit_emitter.hpp" #include "jit_bf16_emitters.hpp" +#include "jit_emitter.hpp" namespace ov { namespace intel_cpu { struct load_emitter_params : public emitter_params { - load_emitter_params(ov::element::Type src_prc, ov::element::Type dst_prc, - int load_num, bool is_fill = false, std::string fill_value = "zero"): - src_prc_(src_prc), dst_prc_(dst_prc), load_num_(load_num), is_fill_(is_fill), fill_value_(fill_value) {} + load_emitter_params(ov::element::Type src_prc, + ov::element::Type dst_prc, + int load_num, + bool is_fill = false, + std::string fill_value = "zero") + : src_prc_(src_prc), + dst_prc_(dst_prc), + load_num_(load_num), + is_fill_(is_fill), + fill_value_(fill_value) {} size_t hash() const override; @@ -25,8 +32,10 @@ struct load_emitter_params : public emitter_params { }; struct store_emitter_params : public emitter_params { - store_emitter_params(ov::element::Type src_prc, ov::element::Type dst_prc, int store_num): - src_prc_(src_prc), dst_prc_(dst_prc), store_num_(store_num) {} + store_emitter_params(ov::element::Type src_prc, ov::element::Type dst_prc, int store_num) + : src_prc_(src_prc), + dst_prc_(dst_prc), + 
store_num_(store_num) {} size_t hash() const override; @@ -36,57 +45,61 @@ struct store_emitter_params : public emitter_params { }; // Arithmetic modes for data type conversion in store_emitter -enum arithmetic_mode { - saturation, - truncation -}; +enum arithmetic_mode { saturation, truncation }; class jit_load_emitter : public jit_emitter { public: - jit_load_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - ov::element::Type src_prc, ov::element::Type dst_prc, int load_num, + jit_load_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + ov::element::Type src_prc, + ov::element::Type dst_prc, + int load_num, ov::element::Type exec_prc = ov::element::f32, - bool is_fill = false, std::string fill_value = "zero", + bool is_fill = false, + std::string fill_value = "zero", emitter_in_out_map in_out_type = emitter_in_out_map::gpr_to_vec); /** - * load_num values with src_prc precision are loaded from ptr[Reg64(in_idxs[0]) + offset_byte] address to Vmm[out_idxs[0]] as dst_prc, where offset_byte is in_idxs[1] - * is_fill: when load_num can not fully fit in vector register, whether fill_value should be filled as default values. - * fill_value: when load_num can not fully fit in vector register, what values should be filled as default values. - * currently support "zero", "int_one", "float_one", "int32_min", "float_min", "int32_max" and "float_max". - * supported src_prc and dst_prc pairs are as below(x indicate for support): - * FP32 I32 I16 U16 I8 U8 BF16 --> src_prc - * FP32 x x x x x x x - * I32 x x x x x x x - * I16 x - * U16 x - * I8 x - * U8 x - * BF16 x - * | - * \|/ - * dst_prc - */ + * load_num values with src_prc precision are loaded from ptr[Reg64(in_idxs[0]) + offset_byte] address to + * Vmm[out_idxs[0]] as dst_prc, where offset_byte is in_idxs[1] is_fill: when load_num can not fully fit in vector + * register, whether fill_value should be filled as default values. 
fill_value: when load_num can not fully fit in + * vector register, what values should be filled as default values. currently support "zero", "int_one", + * "float_one", "int32_min", "float_min", "int32_max" and "float_max". supported src_prc and dst_prc pairs are as + * below(x indicate for support): FP32 I32 I16 U16 I8 U8 BF16 --> src_prc FP32 x x x x + * x x x I32 x x x x x x x I16 x U16 x I8 x U8 + * x BF16 x + * | + * \|/ + * dst_prc + */ // offset in load emitter is the offset of src gpr register, should be parsed from in_idxs. - void emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const override; + void emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const override; size_t get_inputs_num() const override; private: template - void emit_isa(const Xbyak::Reg64 ®_src, const int out_vec_idx, const int offset) const; + void emit_isa(const Xbyak::Reg64& reg_src, const int out_vec_idx, const int offset) const; template - void load_bytes(const Vmm &vmm, const Xbyak::Reg64 ®, int offset, int load_size) const; + void load_bytes(const Vmm& vmm, const Xbyak::Reg64& reg, int offset, int load_size) const; template - void load_bytes_to_dword_extension(const Vmm &vmm, const Xbyak::Reg64 ®, int offset, bool is_signed, int load_size) const; + void load_bytes_to_dword_extension(const Vmm& vmm, + const Xbyak::Reg64& reg, + int offset, + bool is_signed, + int load_size) const; template - void load_words_to_dword_extension(const Vmm &vmm, const Xbyak::Reg64 ®, int offset, ov::element::Type prc, int load_size) const; + void load_words_to_dword_extension(const Vmm& vmm, + const Xbyak::Reg64& reg, + int offset, + ov::element::Type prc, + int load_size) const; template - void fill_with_default(const Vmm &vmm, std::string fill_value, const int &load_num) const; + void fill_with_default(const Vmm& vmm, std::string fill_value, const int& load_num) const; void register_table_entries() override; @@ -104,30 +117,27 @@ class jit_load_emitter : public 
jit_emitter { class jit_store_emitter : public jit_emitter { public: - jit_store_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - ov::element::Type src_prc, ov::element::Type dst_prc, int store_num, + jit_store_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + ov::element::Type src_prc, + ov::element::Type dst_prc, + int store_num, arithmetic_mode mode = arithmetic_mode::saturation, ov::element::Type exec_prc = ov::element::f32, emitter_in_out_map in_out_type = emitter_in_out_map::vec_to_gpr); /** - * store_num values with src_prc in Vmm[in_vec_idx] is stored to ptr[reg_dst + offset_byte] address as dst_prc data, where offset_byte is in_idxs[1] - * supported src_prc and dst_prc pairs are as below(x indicate for support): - * FP32 I32 I16 U16 I8 U8 BF16 --> src_prc - * FP32 x x - * I32 x x - * I16 x x x - * U16 x x x - * I8 x x x - * U8 x x x - * BF16 x* x* x - * \|/ - * dst_prc - * note: FP32/I32-->BF16(x*) is supported only on at least avx512-core plateform - */ + * store_num values with src_prc in Vmm[in_vec_idx] is stored to ptr[reg_dst + offset_byte] address as dst_prc data, + * where offset_byte is in_idxs[1] supported src_prc and dst_prc pairs are as below(x indicate for support): FP32 + * I32 I16 U16 I8 U8 BF16 --> src_prc FP32 x x I32 x x I16 x x x U16 x x + * x I8 x x x U8 x x x BF16 x* x* x + * \|/ + * dst_prc + * note: FP32/I32-->BF16(x*) is supported only on at least avx512-core plateform + */ // offset in store emitter is the offset of dst gpr register, should be parsed from out_idxs. 
- void emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const override; + void emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const override; size_t get_inputs_num() const override; @@ -139,16 +149,19 @@ class jit_store_emitter : public jit_emitter { private: template - void emit_isa(const int in_vec_idx, const Xbyak::Reg64 ®_dst, const int offset) const; + void emit_isa(const int in_vec_idx, const Xbyak::Reg64& reg_dst, const int offset) const; template - void store_bytes(const Xbyak::Reg64 ®, int offset, int store_size) const; + void store_bytes(const Xbyak::Reg64& reg, int offset, int store_size) const; template - void store_dword_to_byte_extension(const Xbyak::Reg64 ®, int offset, bool is_signed, int store_size) const; + void store_dword_to_byte_extension(const Xbyak::Reg64& reg, int offset, bool is_signed, int store_size) const; template - void store_dword_to_word_extension(const Xbyak::Reg64 ®, int offset, ov::element::Type precision, int store_size) const; + void store_dword_to_word_extension(const Xbyak::Reg64& reg, + int offset, + ov::element::Type precision, + int store_size) const; void register_table_entries() override; @@ -176,5 +189,5 @@ class jit_store_emitter : public jit_emitter { mutable int aux_src_idx = 0; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/utils.cpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/utils.cpp index ea16122f2f9793..420e9691ebc73c 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/utils.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/utils.cpp @@ -21,8 +21,21 @@ EmitABIRegSpills::~EmitABIRegSpills() { void EmitABIRegSpills::preamble() { // gprs - Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->r12, h->r13, h->r14, h->r15, - h->rax, h->rbx, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp}; + Xbyak::Operand gprs_to_save[] = {h->r8, + h->r9, + h->r10, + 
h->r11, + h->r12, + h->r13, + h->r14, + h->r15, + h->rax, + h->rbx, + h->rcx, + h->rdx, + h->rdi, + h->rsi, + h->rbp}; size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]); h->sub(h->rsp, n_gprs_to_save * gpr_size); @@ -75,8 +88,21 @@ void EmitABIRegSpills::postamble() { } // restore gpr registers - Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->r12, h->r13, h->r14, h->r15, - h->rax, h->rbx, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp}; + Xbyak::Operand gprs_to_save[] = {h->r8, + h->r9, + h->r10, + h->r11, + h->r12, + h->r13, + h->r14, + h->r15, + h->rax, + h->rbx, + h->rcx, + h->rdx, + h->rdi, + h->rsi, + h->rbp}; size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]); for (int i = n_gprs_to_save - 1; i >= 0; --i) h->mov(gprs_to_save[i], h->ptr[h->rsp + i * gpr_size]); @@ -113,13 +139,17 @@ void EmitABIRegSpills::rsp_restore() { cpu_isa_t EmitABIRegSpills::get_isa() { // need preserve based on cpu capability, instead of host isa. // in case there are possibilty that different isa emitters exist in one kernel from perf standpoint in the future. - // e.g. other emitters isa is avx512, while this emitter isa is avx2, and internal call is used. Internal call may use avx512 and spoil k-reg, ZMM. - // do not care about platform w/ avx512_common but w/o avx512_core(knight landing), which is obsoleted. - if (mayiuse(avx512_core)) return avx512_core; - if (mayiuse(avx2)) return avx2; - if (mayiuse(sse41)) return sse41; + // e.g. other emitters isa is avx512, while this emitter isa is avx2, and internal call is used. Internal call may + // use avx512 and spoil k-reg, ZMM. do not care about platform w/ avx512_common but w/o avx512_core(knight landing), + // which is obsoleted. 
+ if (mayiuse(avx512_core)) + return avx512_core; + if (mayiuse(avx2)) + return avx2; + if (mayiuse(sse41)) + return sse41; OV_CPU_JIT_EMITTER_THROW("unsupported isa"); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/utils.hpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/utils.hpp index 16a66beba7a536..ba956f3375f054 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/utils.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/utils.hpp @@ -30,11 +30,15 @@ class EmitABIRegSpills { static dnnl::impl::cpu::x64::cpu_isa_t get_isa(); - inline size_t get_max_vecs_count() const { return dnnl::impl::cpu::x64::isa_num_vregs(isa); } - inline size_t get_vec_length() const { return dnnl::impl::cpu::x64::isa_max_vlen(isa); } + inline size_t get_max_vecs_count() const { + return dnnl::impl::cpu::x64::isa_num_vregs(isa); + } + inline size_t get_vec_length() const { + return dnnl::impl::cpu::x64::isa_max_vlen(isa); + } - dnnl::impl::cpu::x64::jit_generator* h {nullptr}; - const dnnl::impl::cpu::x64::cpu_isa_t isa {dnnl::impl::cpu::x64::cpu_isa_t::isa_undef}; + dnnl::impl::cpu::x64::jit_generator* h{nullptr}; + const dnnl::impl::cpu::x64::cpu_isa_t isa{dnnl::impl::cpu::x64::cpu_isa_t::isa_undef}; static constexpr int k_mask_size = 8; static constexpr int k_mask_num = 8; @@ -44,5 +48,5 @@ class EmitABIRegSpills { bool rsp_status = true; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_kernel_executor_table.hpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_kernel_executor_table.hpp index 79e8dcafb218f6..cfe03d21eac19e 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_kernel_executor_table.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_kernel_executor_table.hpp @@ -4,34 +4,38 @@ #pragma once -#include "snippets/kernel_executor_table.hpp" 
#include "cache/multi_cache.h" +#include "snippets/kernel_executor_table.hpp" namespace ov { namespace intel_cpu { -template +template class CPUKernelExecutor : public snippets::KernelExecutor { public: - CPUKernelExecutor(ov::intel_cpu::MultiCacheWeakPtr kernel_cache, Conf c) : - snippets::KernelExecutor(std::move(c)), m_kernel_cache(std::move(kernel_cache)) {} + CPUKernelExecutor(ov::intel_cpu::MultiCacheWeakPtr kernel_cache, Conf c) + : snippets::KernelExecutor(std::move(c)), + m_kernel_cache(std::move(kernel_cache)) {} - void update_kernel(const Conf& config, std::shared_ptr& kernel) const override final { // NOLINT + void update_kernel(const Conf& config, std::shared_ptr& kernel) const override final { // NOLINT const auto& cache = m_kernel_cache.lock(); OPENVINO_ASSERT(cache, "Invalid kernel cache pointer in CPUKernelExecutor::update_kernel()"); - const auto& lookup_result = cache->getOrCreate(Key(config), - [this](const Key& k) { - return compile_kernel(k.config); - }); + const auto& lookup_result = cache->getOrCreate(Key(config), [this](const Key& k) { + return compile_kernel(k.config); + }); kernel = lookup_result.first; } protected: struct Key { explicit Key(Conf c) : config{std::move(c)} {} - const Conf config; - size_t hash() const { return config.hash(); } - bool operator==(const Key& rhs) const { return config == rhs.config; } + const Conf config; + size_t hash() const { + return config.hash(); + } + bool operator==(const Key& rhs) const { + return config == rhs.config; + } }; /** Compile kernel managed by KernelExecutor instance. 
Will be called only if Kernel is not found in the cache */ virtual std::shared_ptr compile_kernel(const Conf& c) const = 0; @@ -39,5 +43,5 @@ class CPUKernelExecutor : public snippets::KernelExecutor { ov::intel_cpu::MultiCacheWeakPtr m_kernel_cache; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index b2758735b2d27a..65741d7031d289 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -8,8 +8,8 @@ #include "snippets/utils/utils.hpp" #ifndef OPENVINO_ARCH_ARM64 -#include "transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.hpp" -#include "transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp" +# include "transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.hpp" +# include "transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp" #endif namespace ov { namespace intel_cpu { @@ -21,7 +21,8 @@ const size_t CPURuntimeConfigurator::rank6D = 6; std::string CPURuntimeConfig::to_string() const { std::stringstream out; out << RuntimeConfig::to_string(); - out << "Loop Parameters:" << "\n"; + out << "Loop Parameters:" + << "\n"; for (size_t i = 0; i < loop_args.size(); ++i) { const auto& loop = loop_args[i]; out << "\t[" << i << "] WA: " << loop.m_work_amount << "\n"; @@ -38,8 +39,8 @@ std::string CPURuntimeConfig::to_string() const { } #endif -CPURuntimeConfigurator::CPURuntimeConfigurator() : ov::snippets::RuntimeConfigurator(std::make_shared()) { -} +CPURuntimeConfigurator::CPURuntimeConfigurator() + : ov::snippets::RuntimeConfigurator(std::make_shared()) {} void CPURuntimeConfigurator::initialization(const ov::snippets::lowered::LinearIRCPtr& linear_ir) { 
RuntimeConfigurator::initialization(linear_ir); @@ -78,12 +79,14 @@ void CPURuntimeConfigurator::update_loop_args(const ov::snippets::lowered::Linea const auto& data_sizes = loop_info->get_data_sizes(); auto& loop_arg = cpu_config->loop_args[idx]; - loop_arg = jit_snippets_call_args::loop_args_t(loop_info->get_work_amount(), loop_info->get_ptr_increments(), loop_info->get_finalization_offsets()); + loop_arg = jit_snippets_call_args::loop_args_t(loop_info->get_work_amount(), + loop_info->get_ptr_increments(), + loop_info->get_finalization_offsets()); for (int64_t i = 0; i < loop_arg.m_num_data_ptrs; ++i) { loop_arg.m_ptr_increments[i] *= (increment * data_sizes[i]); loop_arg.m_finalization_offsets[i] *= data_sizes[i]; } } } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp index 42ce35a3c66c2b..1706670ce870d1 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp @@ -34,6 +34,7 @@ class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { * @param linear_ir LinearIR */ void update_loop_args(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const; + protected: void update(const ov::snippets::lowered::LinearIRCPtr& linear_ir) override; void update_tensor_rank(const ov::snippets::VectorDims& master_shape) const override; @@ -43,5 +44,5 @@ class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { static const size_t rank6D; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/jit_container_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/jit_container_emitter.cpp index 6f78c43fd54797..ceee57f3c0cd28 100644 --- 
a/src/plugins/intel_cpu/src/emitters/snippets/jit_container_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/jit_container_emitter.cpp @@ -3,15 +3,18 @@ // #include "jit_container_emitter.hpp" + #include "emitters/utils.hpp" #include "utils/general_utils.h" namespace ov { namespace intel_cpu { -void jit_container_emitter::map_abstract_registers(mapping_info& gpr_map_pool, mapping_info& vec_map_pool, +void jit_container_emitter::map_abstract_registers(mapping_info& gpr_map_pool, + mapping_info& vec_map_pool, snippets::lowered::LinearIR::container& expressions) const { - OV_CPU_JIT_EMITTER_ASSERT(!expressions.empty(), "Cannot map registers when there is no allocated_emitters provided"); + OV_CPU_JIT_EMITTER_ASSERT(!expressions.empty(), + "Cannot map registers when there is no allocated_emitters provided"); auto map_regs = [&](const std::vector& abstract_regs) { std::vector physical_regs = abstract_regs; @@ -19,13 +22,16 @@ void jit_container_emitter::map_abstract_registers(mapping_info& gpr_map_pool, m const auto& abstract_reg = abstract_regs[i]; const auto& type = abstract_reg.type; const auto& abstract = abstract_reg.idx; - OV_CPU_JIT_EMITTER_ASSERT(one_of(type, snippets::RegType::gpr, snippets::RegType::vec), "Incorrect reg type detected!"); + OV_CPU_JIT_EMITTER_ASSERT(one_of(type, snippets::RegType::gpr, snippets::RegType::vec), + "Incorrect reg type detected!"); auto& mapping = type == snippets::RegType::gpr ? 
gpr_map_pool : vec_map_pool; auto& abstract_to_physical = mapping.first; auto& regs_pool = mapping.second; auto& physical = physical_regs[i]; if (abstract_to_physical.count(abstract) == 0) { - OV_CPU_JIT_EMITTER_ASSERT(!regs_pool.empty(), "Cannot map registers for jit_container_emitter: not enough regs in the pool"); + OV_CPU_JIT_EMITTER_ASSERT( + !regs_pool.empty(), + "Cannot map registers for jit_container_emitter: not enough regs in the pool"); physical.idx = regs_pool.back(); regs_pool.pop_back(); abstract_to_physical[abstract] = physical.idx; @@ -48,5 +54,5 @@ void jit_container_emitter::map_abstract_registers(mapping_info& gpr_map_pool, m } } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/jit_container_emitter.hpp b/src/plugins/intel_cpu/src/emitters/snippets/jit_container_emitter.hpp index 2325c6ef1a2eb3..7737e7e1150926 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/jit_container_emitter.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/jit_container_emitter.hpp @@ -20,8 +20,10 @@ class jit_container_emitter { protected: // maps gpr and vec abstract registers to physical ones. 
- void map_abstract_registers(mapping_info& gpr_map_pool, mapping_info& vec_map_pool, snippets::lowered::LinearIR::container& expressions) const; + void map_abstract_registers(mapping_info& gpr_map_pool, + mapping_info& vec_map_pool, + snippets::lowered::LinearIR::container& expressions) const; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/jit_snippets_call_args.cpp b/src/plugins/intel_cpu/src/emitters/snippets/jit_snippets_call_args.cpp index 48f98c2ffb2450..20e19bcba7e4f4 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/jit_snippets_call_args.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/jit_snippets_call_args.cpp @@ -3,10 +3,11 @@ // #include "jit_snippets_call_args.hpp" -#include "emitters/utils.hpp" #include +#include "emitters/utils.hpp" + namespace ov { namespace intel_cpu { @@ -21,16 +22,19 @@ void jit_snippets_call_args::register_loops(const std::vector& loop std::copy(loops.begin(), loops.end(), loop_args); } -jit_snippets_call_args::loop_args_t::loop_args_t(int64_t work_amount, const std::vector& ptr_increments, +jit_snippets_call_args::loop_args_t::loop_args_t(int64_t work_amount, + const std::vector& ptr_increments, const std::vector& finalization_offsets) : m_work_amount(work_amount) { - OV_CPU_JIT_EMITTER_ASSERT(ptr_increments.size() == finalization_offsets.size(), "Inconsistent sizes of ptr_increments and finalization_offsets"); + OV_CPU_JIT_EMITTER_ASSERT(ptr_increments.size() == finalization_offsets.size(), + "Inconsistent sizes of ptr_increments and finalization_offsets"); m_num_data_ptrs = static_cast(ptr_increments.size()); init_pointers_and_copy_data(m_num_data_ptrs, ptr_increments.data(), finalization_offsets.data()); } jit_snippets_call_args::loop_args_t::loop_args_t(const loop_args_t& other) - : m_work_amount(other.m_work_amount), m_num_data_ptrs(other.m_num_data_ptrs) { + : m_work_amount(other.m_work_amount), + 
m_num_data_ptrs(other.m_num_data_ptrs) { init_pointers_and_copy_data(m_num_data_ptrs, other.m_ptr_increments, other.m_finalization_offsets); } @@ -44,7 +48,8 @@ jit_snippets_call_args::loop_args_t& jit_snippets_call_args::loop_args_t::operat return *this; } -void jit_snippets_call_args::loop_args_t::init_pointers_and_copy_data(const int64_t num_elements, const int64_t* ptr_increments, +void jit_snippets_call_args::loop_args_t::init_pointers_and_copy_data(const int64_t num_elements, + const int64_t* ptr_increments, const int64_t* finalization_offsets) { const size_t chunk_size = num_elements * sizeof(int64_t); m_ptr_increments = new int64_t[num_elements]; @@ -60,5 +65,5 @@ void swap(jit_snippets_call_args::loop_args_t& first, jit_snippets_call_args::lo std::swap(first.m_finalization_offsets, second.m_finalization_offsets); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/jit_snippets_call_args.hpp b/src/plugins/intel_cpu/src/emitters/snippets/jit_snippets_call_args.hpp index 027655d493784d..eb74190dd71676 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/jit_snippets_call_args.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/jit_snippets_call_args.hpp @@ -4,9 +4,9 @@ #pragma once -#include -#include #include +#include +#include #include "dnnl_types.h" #include "openvino/core/visibility.hpp" @@ -15,12 +15,12 @@ namespace ov { namespace intel_cpu { #if defined(OPENVINO_ARCH_ARM64) -#define SNIPPETS_MAX_DATA_PTR_COUNT 23 +# define SNIPPETS_MAX_DATA_PTR_COUNT 23 #else -#define SNIPPETS_MAX_DATA_PTR_COUNT 11 +# define SNIPPETS_MAX_DATA_PTR_COUNT 11 #endif -#define GET_OFF(field) offsetof(jit_snippets_call_args, field) +#define GET_OFF(field) offsetof(jit_snippets_call_args, field) #define GET_OFF_LOOP_ARGS(field) offsetof(jit_snippets_call_args::loop_args_t, field) struct amx_tile_config_t { @@ -37,9 +37,9 @@ struct jit_snippets_call_args { void 
register_loops(const std::vector& loops); - const void *src_ptrs[SNIPPETS_MAX_DATA_PTR_COUNT] = {}; - void *dst_ptrs[SNIPPETS_MAX_DATA_PTR_COUNT] = {}; - void *buffer_scratchpad_ptr = nullptr; + const void* src_ptrs[SNIPPETS_MAX_DATA_PTR_COUNT] = {}; + void* dst_ptrs[SNIPPETS_MAX_DATA_PTR_COUNT] = {}; + void* buffer_scratchpad_ptr = nullptr; // Note: Ideally loop_args must be private, since we manage this pointer manually. // However, standard-layout class definition (to use offset_of) requires the same access specifier @@ -51,14 +51,18 @@ struct jit_snippets_call_args { struct jit_snippets_call_args::loop_args_t { loop_args_t() = default; - loop_args_t(int64_t work_amount, const std::vector& ptr_increments, const std::vector& finalization_offsets); + loop_args_t(int64_t work_amount, + const std::vector& ptr_increments, + const std::vector& finalization_offsets); loop_args_t(const loop_args_t& other); ~loop_args_t(); loop_args_t& operator=(loop_args_t other); friend void swap(loop_args_t& first, loop_args_t& second); - void init_pointers_and_copy_data(const int64_t num_elements, const int64_t* ptr_increments, const int64_t* finalization_offsets); + void init_pointers_and_copy_data(const int64_t num_elements, + const int64_t* ptr_increments, + const int64_t* finalization_offsets); int64_t m_work_amount = 0; int64_t m_num_data_ptrs = 0; @@ -71,5 +75,5 @@ struct jit_snippets_compile_args { std::vector exec_domain = {}; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/utils/debug_caps_config.cpp b/src/plugins/intel_cpu/src/emitters/snippets/utils/debug_caps_config.cpp index b7c51539861ff8..e4c3c40e1d8120 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/utils/debug_caps_config.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/utils/debug_caps_config.cpp @@ -3,7 +3,7 @@ // #ifdef SNIPPETS_DEBUG_CAPS -#include "debug_caps_config.hpp" +# include 
"debug_caps_config.hpp" namespace ov { namespace intel_cpu { @@ -20,7 +20,7 @@ void SnippetsDebugCapsConfig::readProperties() { enable_segfault_detector = readEnv("OV_CPU_SNIPPETS_SEGFAULT_DETECTOR") ? true : false; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov -#endif // SNIPPETS_DEBUG_CAPS +#endif // SNIPPETS_DEBUG_CAPS diff --git a/src/plugins/intel_cpu/src/emitters/snippets/utils/debug_caps_config.hpp b/src/plugins/intel_cpu/src/emitters/snippets/utils/debug_caps_config.hpp index 14dcae0ddf0c69..8f01e85063f5e9 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/utils/debug_caps_config.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/utils/debug_caps_config.hpp @@ -3,10 +3,10 @@ // #ifdef SNIPPETS_DEBUG_CAPS -#pragma once +# pragma once -#include -#include +# include +# include namespace ov { namespace intel_cpu { @@ -23,7 +23,7 @@ class SnippetsDebugCapsConfig { void readProperties(); }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov -#endif // SNIPPETS_DEBUG_CAPS +#endif // SNIPPETS_DEBUG_CAPS diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp index c210782db8f91c..39e384837856a1 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp @@ -4,63 +4,61 @@ #include "cpu_generator.hpp" -#include "snippets/snippets_isa.hpp" -#include "emitters/snippets/cpu_runtime_configurator.hpp" +#include +#include "emitters/plugin/x64/jit_conversion_emitters.hpp" +#include "emitters/plugin/x64/jit_dnnl_emitters.hpp" +#include "emitters/plugin/x64/jit_dnnl_ext_emitters.hpp" +#include "emitters/plugin/x64/jit_eltwise_emitters.hpp" +#include "emitters/snippets/cpu_kernel_executor_table.hpp" +#include "emitters/snippets/cpu_runtime_configurator.hpp" #include 
"emitters/snippets/x64/jit_brgemm_copy_b_emitter.hpp" #include "emitters/snippets/x64/jit_brgemm_emitter.hpp" -#include "emitters/snippets/x64/jit_memory_emitters.hpp" +#include "emitters/snippets/x64/jit_fill_emitter.hpp" +#include "emitters/snippets/x64/jit_horizon_emitter.hpp" #include "emitters/snippets/x64/jit_kernel_emitter.hpp" #include "emitters/snippets/x64/jit_loop_emitters.hpp" +#include "emitters/snippets/x64/jit_memory_emitters.hpp" #include "emitters/snippets/x64/jit_snippets_emitters.hpp" -#include "emitters/snippets/x64/jit_fill_emitter.hpp" -#include "emitters/snippets/x64/jit_horizon_emitter.hpp" -#include "emitters/plugin/x64/jit_eltwise_emitters.hpp" -#include "emitters/plugin/x64/jit_dnnl_emitters.hpp" -#include "emitters/plugin/x64/jit_dnnl_ext_emitters.hpp" -#include "emitters/plugin/x64/jit_conversion_emitters.hpp" - -#include "transformations/snippets/x64/op/load_convert.hpp" -#include "transformations/snippets/x64/op/store_convert.hpp" +#include "snippets/snippets_isa.hpp" +#include "transformations/cpu_opset/common/op/swish_cpu.hpp" #include "transformations/snippets/common/op/fused_mul_add.hpp" #include "transformations/snippets/x64/op/brgemm_copy_b.hpp" #include "transformations/snippets/x64/op/brgemm_cpu.hpp" +#include "transformations/snippets/x64/op/load_convert.hpp" #include "transformations/snippets/x64/op/perf_count_rdtsc.hpp" -#include "transformations/cpu_opset/common/op/swish_cpu.hpp" +#include "transformations/snippets/x64/op/store_convert.hpp" #include "transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.hpp" -#include -#include "emitters/snippets/cpu_kernel_executor_table.hpp" - #ifdef SNIPPETS_DEBUG_CAPS -#include "emitters/snippets/x64/jit_perf_count_chrono_emitters.hpp" -#include "emitters/snippets/x64/jit_perf_count_rdtsc_emitters.hpp" -#include "transformations/snippets/x64/op/perf_count_rdtsc.hpp" -#include "emitters/snippets/x64/jit_debug_emitter.hpp" -#include 
"emitters/snippets/x64/jit_segfault_detector_emitter.hpp" -#include "emitters/snippets/x64/verbose.hpp" +# include "emitters/snippets/x64/jit_debug_emitter.hpp" +# include "emitters/snippets/x64/jit_perf_count_chrono_emitters.hpp" +# include "emitters/snippets/x64/jit_perf_count_rdtsc_emitters.hpp" +# include "emitters/snippets/x64/jit_segfault_detector_emitter.hpp" +# include "emitters/snippets/x64/verbose.hpp" +# include "transformations/snippets/x64/op/perf_count_rdtsc.hpp" #endif #ifdef SNIPPETS_LIBXSMM_TPP -#include "transformations/tpp/x64/op/brgemm.hpp" -#include "transformations/tpp/x64/op/eltwise.hpp" -#include "transformations/tpp/x64/op/reduce.hpp" -#include "transformations/tpp/x64/op/modifiers.hpp" -#include "transformations/tpp/x64/op/scalar.hpp" -#include "transformations/tpp/x64/op/equation.hpp" -#include "emitters/tpp/x64/jit_eltwise_emitters.hpp" -#include "emitters/tpp/x64/jit_brgemm_emitter.hpp" -#include "emitters/tpp/x64/jit_scalar_emitter.hpp" -#include "emitters/tpp/x64/jit_equation_emitter.hpp" -#include "emitters/tpp/x64/jit_debug_emitter.hpp" +# include "emitters/tpp/x64/jit_brgemm_emitter.hpp" +# include "emitters/tpp/x64/jit_debug_emitter.hpp" +# include "emitters/tpp/x64/jit_eltwise_emitters.hpp" +# include "emitters/tpp/x64/jit_equation_emitter.hpp" +# include "emitters/tpp/x64/jit_scalar_emitter.hpp" +# include "transformations/tpp/x64/op/brgemm.hpp" +# include "transformations/tpp/x64/op/eltwise.hpp" +# include "transformations/tpp/x64/op/equation.hpp" +# include "transformations/tpp/x64/op/modifiers.hpp" +# include "transformations/tpp/x64/op/reduce.hpp" +# include "transformations/tpp/x64/op/scalar.hpp" // Note: for reference implementations -#include +# include #endif namespace ov { #ifdef SNIPPETS_DEBUG_CAPS -static bool is_load_emitter(const intel_cpu::jit_emitter *emitter) { +static bool is_load_emitter(const intel_cpu::jit_emitter* emitter) { bool ret = false; if (dynamic_cast(emitter) || dynamic_cast(emitter)) { @@ -69,7 
+67,7 @@ static bool is_load_emitter(const intel_cpu::jit_emitter *emitter) { return ret; } -static bool is_store_emitter(const intel_cpu::jit_emitter *emitter) { +static bool is_store_emitter(const intel_cpu::jit_emitter* emitter) { bool ret = false; if (dynamic_cast(emitter)) { return true; @@ -77,72 +75,82 @@ static bool is_store_emitter(const intel_cpu::jit_emitter *emitter) { return ret; } -static bool is_segfault_detector_emitter(const intel_cpu::jit_emitter *emitter) { +static bool is_segfault_detector_emitter(const intel_cpu::jit_emitter* emitter) { // default active for typical tensor memory access emitters bool ret = false; - ret = is_load_emitter(emitter) || - is_store_emitter(emitter) || - dynamic_cast(emitter) || - dynamic_cast(emitter) || - dynamic_cast(emitter); + ret = is_load_emitter(emitter) || is_store_emitter(emitter) || + dynamic_cast(emitter) || + dynamic_cast(emitter) || + dynamic_cast(emitter); return ret; // use below code to active all emitters for extend usage // return !dynamic_cast(emitter); } -#define CREATE_SNIPPETS_EMITTER(e_type, ...) { \ - [this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ - auto emitter = std::make_shared(h.get(), isa, expr, ##__VA_ARGS__); \ - if (debug_config.enable_segfault_detector && is_segfault_detector_emitter(emitter.get())) { \ - auto segfault_emitter = std::make_shared(h.get(), isa, emitter.get(), \ - is_load_emitter(emitter.get()), is_store_emitter(emitter.get()), expr->get_node()->get_friendly_name()); \ - return std::make_shared(emitter, segfault_emitter, jit_debug_emitter::EmissionLocation::preamble); \ - } else { \ - return emitter; \ - } \ - }, \ - [](const std::shared_ptr& n) -> std::set> { \ - return e_type::get_supported_precisions(n); \ - } \ -} +# define CREATE_SNIPPETS_EMITTER(e_type, ...) 
\ + { \ + [this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ + auto emitter = std::make_shared(h.get(), isa, expr, ##__VA_ARGS__); \ + if (debug_config.enable_segfault_detector && is_segfault_detector_emitter(emitter.get())) { \ + auto segfault_emitter = \ + std::make_shared(h.get(), \ + isa, \ + emitter.get(), \ + is_load_emitter(emitter.get()), \ + is_store_emitter(emitter.get()), \ + expr->get_node()->get_friendly_name()); \ + return std::make_shared(emitter, \ + segfault_emitter, \ + jit_debug_emitter::EmissionLocation::preamble); \ + } else { \ + return emitter; \ + } \ + }, \ + [](const std::shared_ptr& n) -> std::set> { \ + return e_type::get_supported_precisions(n); \ + } \ + } #else -#define CREATE_SNIPPETS_EMITTER(e_type, ...) { \ - [this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ - return std::make_shared(h.get(), isa, expr, ##__VA_ARGS__); \ - }, \ - [](const std::shared_ptr& n) -> std::set> { \ - return e_type::get_supported_precisions(n); \ - } \ -} +# define CREATE_SNIPPETS_EMITTER(e_type, ...) 
\ + { \ + [this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ + return std::make_shared(h.get(), isa, expr, ##__VA_ARGS__); \ + }, \ + [](const std::shared_ptr& n) -> std::set> { \ + return e_type::get_supported_precisions(n); \ + } \ + } #endif -#define CREATE_DEBUG_TPP_EMITTER(e_type) { \ - [this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ - return std::make_shared(expr, std::make_shared(h.get(), isa, expr)); \ - }, \ - [](const std::shared_ptr& n) -> std::set> { \ - return e_type::get_supported_precisions(n); \ - } \ -} - +#define CREATE_DEBUG_TPP_EMITTER(e_type) \ + { \ + [this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ + return std::make_shared(expr, std::make_shared(h.get(), isa, expr)); \ + }, \ + [](const std::shared_ptr& n) -> std::set> { \ + return e_type::get_supported_precisions(n); \ + } \ + } -#define CREATE_CPU_EMITTER(e_type) { \ - [this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ - return std::make_shared(h.get(), isa, expr->get_node()); \ - }, \ - [](const std::shared_ptr& n) -> std::set> { \ - return e_type::get_supported_precisions(n); \ - } \ -} +#define CREATE_CPU_EMITTER(e_type) \ + { \ + [this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ + return std::make_shared(h.get(), isa, expr->get_node()); \ + }, \ + [](const std::shared_ptr& n) -> std::set> { \ + return e_type::get_supported_precisions(n); \ + } \ + } -#define CREATE_UNDEFINED_EMITTER(supported_precisions) { \ - [](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ - return nullptr; \ - }, \ - [](const std::shared_ptr& n) -> std::set> { \ - return supported_precisions; \ - } \ -} +#define CREATE_UNDEFINED_EMITTER(supported_precisions) \ + { \ + [](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ + return nullptr; \ + }, \ + [](const std::shared_ptr& n) -> std::set> { \ + return supported_precisions; \ + } \ + } class 
jit_snippet : public dnnl::impl::cpu::x64::jit_generator { public: @@ -157,30 +165,43 @@ class jit_snippet : public dnnl::impl::cpu::x64::jit_generator { intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::intel_cpu::MultiCacheWeakPtr cache) - : TargetMachine(std::make_shared()), h(new jit_snippet()), isa(host_isa), compiled_kernel_cache(std::move(cache)) { + : TargetMachine(std::make_shared()), + h(new jit_snippet()), + isa(host_isa), + compiled_kernel_cache(std::move(cache)) { // data movement jitters[op::v0::Parameter::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[op::v0::Result::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::Buffer::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::VectorBuffer::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); - jitters[snippets::op::RankNormalization::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); + jitters[snippets::op::RankNormalization::get_type_info_static()] = + CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::Reshape::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::Load::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); - jitters[snippets::op::LoadReshape::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); - jitters[snippets::op::BroadcastLoad::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_broadcast_emitter); - jitters[intel_cpu::LoadConvertSaturation::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); - jitters[intel_cpu::LoadConvertTruncation::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); + 
jitters[snippets::op::LoadReshape::get_type_info_static()] = + CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); + jitters[snippets::op::BroadcastLoad::get_type_info_static()] = + CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_broadcast_emitter); + jitters[intel_cpu::LoadConvertSaturation::get_type_info_static()] = + CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); + jitters[intel_cpu::LoadConvertTruncation::get_type_info_static()] = + CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); jitters[snippets::op::Store::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_store_memory_emitter); - jitters[intel_cpu::StoreConvertSaturation::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_store_memory_emitter); - jitters[intel_cpu::StoreConvertTruncation::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_store_memory_emitter); + jitters[intel_cpu::StoreConvertSaturation::get_type_info_static()] = + CREATE_SNIPPETS_EMITTER(intel_cpu::jit_store_memory_emitter); + jitters[intel_cpu::StoreConvertTruncation::get_type_info_static()] = + CREATE_SNIPPETS_EMITTER(intel_cpu::jit_store_memory_emitter); jitters[snippets::op::Scalar::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_scalar_emitter); - jitters[snippets::op::BroadcastMove::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_broadcast_move_emitter); + jitters[snippets::op::BroadcastMove::get_type_info_static()] = + CREATE_SNIPPETS_EMITTER(intel_cpu::jit_broadcast_move_emitter); - jitters[snippets::op::ConvertTruncation::get_type_info_static()] = CREATE_CPU_EMITTER(intel_cpu::jit_convert_truncation_emitter); - jitters[snippets::op::ConvertSaturation::get_type_info_static()] = CREATE_CPU_EMITTER(intel_cpu::jit_convert_saturation_emitter); + jitters[snippets::op::ConvertTruncation::get_type_info_static()] = + CREATE_CPU_EMITTER(intel_cpu::jit_convert_truncation_emitter); + 
jitters[snippets::op::ConvertSaturation::get_type_info_static()] = + CREATE_CPU_EMITTER(intel_cpu::jit_convert_saturation_emitter); // ternary jitters[op::v1::Select::get_type_info_static()] = CREATE_CPU_EMITTER(intel_cpu::jit_select_emitter); @@ -203,10 +224,12 @@ intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t ho jitters[op::v1::Mod::get_type_info_static()] = CREATE_CPU_EMITTER(intel_cpu::jit_mod_emitter); jitters[op::v1::Multiply::get_type_info_static()] = CREATE_CPU_EMITTER(intel_cpu::jit_multiply_emitter); jitters[op::v1::NotEqual::get_type_info_static()] = CREATE_CPU_EMITTER(intel_cpu::jit_not_equal_emitter); - jitters[snippets::op::PowerStatic::get_type_info_static()] = CREATE_CPU_EMITTER(intel_cpu::jit_power_static_emitter); + jitters[snippets::op::PowerStatic::get_type_info_static()] = + CREATE_CPU_EMITTER(intel_cpu::jit_power_static_emitter); jitters[op::v1::Power::get_type_info_static()] = CREATE_CPU_EMITTER(intel_cpu::jit_power_dynamic_emitter); jitters[op::v0::PRelu::get_type_info_static()] = CREATE_CPU_EMITTER(intel_cpu::jit_prelu_emitter); - jitters[op::v0::SquaredDifference::get_type_info_static()] = CREATE_CPU_EMITTER(intel_cpu::jit_squared_difference_emitter); + jitters[op::v0::SquaredDifference::get_type_info_static()] = + CREATE_CPU_EMITTER(intel_cpu::jit_squared_difference_emitter); jitters[op::v1::Subtract::get_type_info_static()] = CREATE_CPU_EMITTER(intel_cpu::jit_subtract_emitter); jitters[op::v0::Xor::get_type_info_static()] = CREATE_CPU_EMITTER(intel_cpu::jit_logical_xor_emitter); @@ -235,25 +258,35 @@ intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t ho jitters[snippets::op::HorizonMax::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_horizon_emitter); jitters[snippets::op::HorizonSum::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_horizon_emitter); - jitters[snippets::op::KernelStatic::get_type_info_static()] = 
CREATE_SNIPPETS_EMITTER(intel_cpu::jit_kernel_static_emitter); - jitters[snippets::op::KernelDynamic::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_kernel_dynamic_emitter); - jitters[snippets::op::LoopBegin::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_loop_begin_emitter); + jitters[snippets::op::KernelStatic::get_type_info_static()] = + CREATE_SNIPPETS_EMITTER(intel_cpu::jit_kernel_static_emitter); + jitters[snippets::op::KernelDynamic::get_type_info_static()] = + CREATE_SNIPPETS_EMITTER(intel_cpu::jit_kernel_dynamic_emitter); + jitters[snippets::op::LoopBegin::get_type_info_static()] = + CREATE_SNIPPETS_EMITTER(intel_cpu::jit_loop_begin_emitter); jitters[snippets::op::LoopEnd::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_loop_end_emitter); - // Note: jit_brgemm_emitter and jit_brgemm_copy_b_emitter support runtime recompilation, so their constructor takes additional arguments - jitters[intel_cpu::BrgemmCPU::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_brgemm_emitter, - configurator->get_kernel_executor_table(), - compiled_kernel_cache); - jitters[intel_cpu::BrgemmCopyB::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_brgemm_copy_b_emitter, - configurator->get_kernel_executor_table(), - compiled_kernel_cache); + // Note: jit_brgemm_emitter and jit_brgemm_copy_b_emitter support runtime recompilation, so their constructor takes + // additional arguments + jitters[intel_cpu::BrgemmCPU::get_type_info_static()] = + CREATE_SNIPPETS_EMITTER(intel_cpu::jit_brgemm_emitter, + configurator->get_kernel_executor_table(), + compiled_kernel_cache); + jitters[intel_cpu::BrgemmCopyB::get_type_info_static()] = + CREATE_SNIPPETS_EMITTER(intel_cpu::jit_brgemm_copy_b_emitter, + configurator->get_kernel_executor_table(), + compiled_kernel_cache); jitters[snippets::op::ReduceMax::get_type_info_static()] = CREATE_UNDEFINED_EMITTER({{ov::element::f32}}); 
jitters[snippets::op::ReduceSum::get_type_info_static()] = CREATE_UNDEFINED_EMITTER({{ov::element::f32}}); #ifdef SNIPPETS_DEBUG_CAPS - jitters[snippets::op::PerfCountBegin::get_type_info_static()] = CREATE_CPU_EMITTER(ov::intel_cpu::jit_perf_count_chrono_start_emitter); - jitters[snippets::op::PerfCountEnd::get_type_info_static()] = CREATE_CPU_EMITTER(ov::intel_cpu::jit_perf_count_chrono_end_emitter); - jitters[ov::intel_cpu::PerfCountRdtscBegin::get_type_info_static()] = CREATE_CPU_EMITTER(ov::intel_cpu::jit_perf_count_rdtsc_start_emitter); - jitters[ov::intel_cpu::PerfCountRdtscEnd::get_type_info_static()] = CREATE_CPU_EMITTER(ov::intel_cpu::jit_perf_count_rdtsc_end_emitter); + jitters[snippets::op::PerfCountBegin::get_type_info_static()] = + CREATE_CPU_EMITTER(ov::intel_cpu::jit_perf_count_chrono_start_emitter); + jitters[snippets::op::PerfCountEnd::get_type_info_static()] = + CREATE_CPU_EMITTER(ov::intel_cpu::jit_perf_count_chrono_end_emitter); + jitters[ov::intel_cpu::PerfCountRdtscBegin::get_type_info_static()] = + CREATE_CPU_EMITTER(ov::intel_cpu::jit_perf_count_rdtsc_start_emitter); + jitters[ov::intel_cpu::PerfCountRdtscEnd::get_type_info_static()] = + CREATE_CPU_EMITTER(ov::intel_cpu::jit_perf_count_rdtsc_end_emitter); #endif #ifdef SNIPPETS_LIBXSMM_TPP @@ -267,8 +300,8 @@ intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t ho // Note: you can register Debug emitter for Unary/Binary operations as shown below: // jitters[intel_cpu::tpp::op::Add::get_type_info_static()] = CREATE_DEBUG_TPP_EMITTER(UnaryEltwiseTppEmitter); // - // Note: you can register Reference emitter for Unary operations using std::function or lambda function as shown below: - // jitters[intel_cpu::tpp::op::Exp::get_type_info_static()] = + // Note: you can register Reference emitter for Unary operations using std::function or lambda function as shown + // below: jitters[intel_cpu::tpp::op::Exp::get_type_info_static()] = // 
CREATE_SNIPPETS_EMITTER(ReferenceUnaryEltwiseTppEmitter, static_cast(std::exp)); // jitters[intel_cpu::tpp::op::Reciprocal::get_type_info_static()] = // CREATE_SNIPPETS_EMITTER(ReferenceUnaryEltwiseTppEmitter, [](float x){ return 1.f/x; }); @@ -292,10 +325,14 @@ std::shared_ptr intel_cpu::CPUTargetMachine::clone() co size_t intel_cpu::CPUTargetMachine::get_lanes() const { switch (isa) { - case dnnl::impl::cpu::x64::avx2 : return dnnl::impl::cpu::x64::cpu_isa_traits::vlen / sizeof(float); - case dnnl::impl::cpu::x64::sse41 : return dnnl::impl::cpu::x64::cpu_isa_traits::vlen / sizeof(float); - case dnnl::impl::cpu::x64::avx512_core : return dnnl::impl::cpu::x64::cpu_isa_traits::vlen / sizeof(float); - default : OPENVINO_THROW("unknown isa ", isa); + case dnnl::impl::cpu::x64::avx2: + return dnnl::impl::cpu::x64::cpu_isa_traits::vlen / sizeof(float); + case dnnl::impl::cpu::x64::sse41: + return dnnl::impl::cpu::x64::cpu_isa_traits::vlen / sizeof(float); + case dnnl::impl::cpu::x64::avx512_core: + return dnnl::impl::cpu::x64::cpu_isa_traits::vlen / sizeof(float); + default: + OPENVINO_THROW("unknown isa ", isa); } } @@ -315,13 +352,15 @@ snippets::CompiledSnippetPtr intel_cpu::CPUTargetMachine::get_snippet() { if (h->create_kernel() != dnnl::impl::status::success) { OPENVINO_THROW("Failed to create jit_kernel in get_snippet()"); } - const auto& result = std::make_shared(std::unique_ptr(h.release())); + const auto& result = + std::make_shared(std::unique_ptr(h.release())); // Note that we reset all the generated code, since it was copied into CompiledSnippetCPU h.reset(new jit_snippet()); return result; } -intel_cpu::CompiledSnippetCPU::CompiledSnippetCPU(std::unique_ptr h) : h_compiled(std::move(h)) { +intel_cpu::CompiledSnippetCPU::CompiledSnippetCPU(std::unique_ptr h) + : h_compiled(std::move(h)) { OPENVINO_ASSERT(h_compiled && h_compiled->jit_ker(), "Got invalid jit generator or kernel was nopt compiled"); } @@ -337,15 +376,14 @@ bool 
intel_cpu::CompiledSnippetCPU::empty() const { return get_code_size() == 0; } -intel_cpu::CPUGenerator::CPUGenerator(dnnl::impl::cpu::x64::cpu_isa_t isa_, ov::intel_cpu::MultiCacheWeakPtr cache) : - Generator(std::make_shared(isa_, std::move(cache))) { -} -intel_cpu::CPUGenerator::CPUGenerator(const std::shared_ptr& target) : Generator(target) { -} +intel_cpu::CPUGenerator::CPUGenerator(dnnl::impl::cpu::x64::cpu_isa_t isa_, ov::intel_cpu::MultiCacheWeakPtr cache) + : Generator(std::make_shared(isa_, std::move(cache))) {} +intel_cpu::CPUGenerator::CPUGenerator(const std::shared_ptr& target) : Generator(target) {} std::shared_ptr intel_cpu::CPUGenerator::clone() const { const auto& cpu_target_machine = std::dynamic_pointer_cast(target->clone()); - OPENVINO_ASSERT(cpu_target_machine, "Failed to clone CPUGenerator: the instance contains incompatible TargetMachine type"); + OPENVINO_ASSERT(cpu_target_machine, + "Failed to clone CPUGenerator: the instance contains incompatible TargetMachine type"); return std::make_shared(cpu_target_machine); } @@ -358,12 +396,11 @@ ov::snippets::RegType intel_cpu::CPUGenerator::get_specific_op_out_reg_type(cons #endif std::dynamic_pointer_cast(op)) return ov::snippets::RegType::gpr; - else if ( - std::dynamic_pointer_cast(op) || - std::dynamic_pointer_cast(op)) + else if (std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op)) return ov::snippets::RegType::vec; else - return ov::snippets::RegType::undefined; + return ov::snippets::RegType::undefined; } bool intel_cpu::CPUGenerator::uses_precompiled_kernel(const std::shared_ptr& e) const { @@ -383,4 +420,4 @@ bool intel_cpu::CPUGenerator::uses_precompiled_kernel(const std::shared_ptr h_compiled; + public: const uint8_t* get_code() const override; size_t get_code_size() const override; @@ -31,8 +30,7 @@ class CompiledSnippetCPU : public snippets::CompiledSnippet { class CPUTargetMachine : public snippets::TargetMachine { public: - explicit 
CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t host_isa, - ov::intel_cpu::MultiCacheWeakPtr); + explicit CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::intel_cpu::MultiCacheWeakPtr); std::shared_ptr clone() const override; bool is_supported() const override; snippets::CompiledSnippetPtr get_snippet() override; @@ -60,5 +58,5 @@ class CPUGenerator : public snippets::Generator { bool uses_precompiled_kernel(const std::shared_ptr& emitter) const override; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.cpp index e68ab224407c7b..6df658d8d72d0c 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.cpp @@ -4,18 +4,15 @@ #include "jit_brgemm_copy_b_emitter.hpp" +#include +#include + #include "emitters/plugin/x64/utils.hpp" #include "emitters/snippets/x64/utils.hpp" - -#include "snippets/utils/utils.hpp" #include "snippets/lowered/expression.hpp" - +#include "snippets/utils/utils.hpp" #include "transformations/snippets/x64/op/brgemm_cpu.hpp" -#include -#include - - using namespace Xbyak; using namespace dnnl::impl; using namespace dnnl::impl::cpu::x64; @@ -34,7 +31,9 @@ bool get_is_transposed(const ov::snippets::lowered::ExpressionPtr& expr) { } } // namespace -jit_brgemm_copy_b_emitter::jit_brgemm_copy_b_emitter(jit_generator* h, cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr, +jit_brgemm_copy_b_emitter::jit_brgemm_copy_b_emitter(jit_generator* h, + cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr, const snippets::KernelExecutorTablePtr& kernel_table, const ov::intel_cpu::MultiCacheWeakPtr& compiled_kernel_cache) : jit_emitter(h, isa) { @@ -48,29 +47,29 @@ 
jit_brgemm_copy_b_emitter::jit_brgemm_copy_b_emitter(jit_generator* h, cpu_isa_t OV_CPU_JIT_EMITTER_ASSERT(!snippets::utils::is_dynamic_vdims(expr->get_input_port_descriptor(0)->get_shape()), "Jit emitter is called when the shapes are unknown"); - const auto& in_subtensor = get_projected_subtensor(expr->get_input_port(0)); - const auto K_blk = *++in_subtensor.rbegin(); - const auto& src_prc = brgemm_repack->get_src_element_type(); const auto& wei_prc = brgemm_repack->get_input_element_type(0); const auto wei_N_blk = brgemm_utils::repacking::compute_inner_n_block(wei_prc); const auto is_transposed = get_is_transposed(expr); - const auto brgemm_type = get_brgemm_type(src_prc, K_blk, is_transposed); + const auto brgemm_type = get_brgemm_type(src_prc, is_transposed); const auto primitive_isa = brgemm_utils::get_primitive_isa(src_prc, with_amx(brgemm_type)); m_with_comp = with_compensations(brgemm_type); BrgemmCopyBKernelConfig kernel_config(src_prc, wei_prc, primitive_isa, m_with_comp, is_transposed, wei_N_blk); - m_kernel_executor = kernel_table->register_kernel(expr, compiled_kernel_cache, kernel_config); + m_kernel_executor = + kernel_table->register_kernel(expr, compiled_kernel_cache, kernel_config); m_memory_offsets = {brgemm_repack->get_offset_in(), brgemm_repack->get_offset_out()}; - m_buffer_ids = {utils::get_buffer_cluster_id(expr->get_input_port(0)), utils::get_buffer_cluster_id(expr->get_output_port(0))}; + m_buffer_ids = {utils::get_buffer_cluster_id(expr->get_input_port(0)), + utils::get_buffer_cluster_id(expr->get_output_port(0))}; if (m_with_comp) { m_memory_offsets.push_back(brgemm_repack->get_offset_compensations()); m_buffer_ids.push_back(utils::get_buffer_cluster_id(expr->get_output_port(1))); } } -void jit_brgemm_copy_b_emitter::validate_arguments(const std::vector &in, const std::vector &out) const { +void jit_brgemm_copy_b_emitter::validate_arguments(const std::vector& in, + const std::vector& out) const { OV_CPU_JIT_EMITTER_ASSERT(in.size() == 1, 
"expects 1 input"); OV_CPU_JIT_EMITTER_ASSERT((m_with_comp && out.size() == 2) || (!m_with_comp && out.size() == 1), "expects 2 outputs if there are compensations"); @@ -90,14 +89,20 @@ void jit_brgemm_copy_b_emitter::emit_impl(const std::vector& in, const s // Reserve memory on the stack h->sub(h->rsp, reserved_stack_size); - const bool is_dynamic_case = std::any_of(m_memory_offsets.cbegin(), m_memory_offsets.cend(), ov::snippets::utils::is_dynamic_value); + const bool is_dynamic_case = + std::any_of(m_memory_offsets.cbegin(), m_memory_offsets.cend(), ov::snippets::utils::is_dynamic_value); Xbyak::Reg64 aux_reg = is_dynamic_case ? ov::intel_cpu::utils::get_aux_gpr(mem_ptrs_idxs) : Xbyak::Reg64(); - const std::vector args_offsets {GET_OFF_BRGEMM_COPY_B_ARGS(src), GET_OFF_BRGEMM_COPY_B_ARGS(tr_src), GET_OFF_BRGEMM_COPY_B_ARGS(compensation_ptr)}; + const std::vector args_offsets{GET_OFF_BRGEMM_COPY_B_ARGS(src), + GET_OFF_BRGEMM_COPY_B_ARGS(tr_src), + GET_OFF_BRGEMM_COPY_B_ARGS(compensation_ptr)}; const auto& mem_ptrs = ov::intel_cpu::utils::transform_idxs_to_regs(mem_ptrs_idxs); for (size_t i = 0; i < mem_ptrs.size(); i++) { if (ov::snippets::utils::is_dynamic_value(m_memory_offsets[i])) - utils::push_ptr_with_runtime_offset_on_stack(h, args_offsets[i], mem_ptrs[i], aux_reg, + utils::push_ptr_with_runtime_offset_on_stack(h, + args_offsets[i], + mem_ptrs[i], + aux_reg, GET_OFF(buffer_offsets) + m_buffer_ids[i] * sizeof(size_t)); else utils::push_ptr_with_static_offset_on_stack(h, args_offsets[i], mem_ptrs[i], m_memory_offsets[i]); @@ -119,5 +124,5 @@ void jit_brgemm_copy_b_emitter::emit_impl(const std::vector& in, const s spill.postamble(); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.hpp index ef53efe6081217..96a80153bba4b6 100644 --- 
a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.hpp @@ -5,38 +5,40 @@ #pragma once #include "emitters/plugin/x64/jit_emitter.hpp" - #include "kernel_executors/brgemm_copy_b.hpp" - namespace ov { namespace intel_cpu { class jit_brgemm_copy_b_emitter : public jit_emitter { public: - jit_brgemm_copy_b_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + jit_brgemm_copy_b_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr, const snippets::KernelExecutorTablePtr& kernel_table, const ov::intel_cpu::MultiCacheWeakPtr& compiled_kernel_cache); - size_t get_inputs_num() const override {return 1;} - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr) { + size_t get_inputs_num() const override { + return 1; + } + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr) { return {{element::i8}, {element::bf16}, {element::f32}}; } private: - void validate_arguments(const std::vector &in, const std::vector &out) const override; + void validate_arguments(const std::vector& in, const std::vector& out) const override; void emit_impl(const std::vector& in, const std::vector& out) const override; std::vector m_memory_offsets{}; std::vector m_buffer_ids{}; - std::shared_ptr m_kernel_executor {nullptr}; - bool m_with_comp {false}; + std::shared_ptr m_kernel_executor{nullptr}; + bool m_with_comp{false}; #ifdef SNIPPETS_DEBUG_CAPS - friend std::string init_info_jit_brgemm_copy_b_emitter(const jit_brgemm_copy_b_emitter *emitter); + friend std::string init_info_jit_brgemm_copy_b_emitter(const jit_brgemm_copy_b_emitter* emitter); #endif }; -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git 
a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.cpp index 057a3687ab8d16..172a1cc0b98284 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.cpp @@ -4,11 +4,13 @@ #include "jit_brgemm_emitter.hpp" -#include "transformations/snippets/x64/op/brgemm_cpu.hpp" -#include "snippets/utils/utils.hpp" #include "emitters/plugin/x64/utils.hpp" -#include "utils.hpp" +#include "emitters/snippets/x64/kernel_executors/brgemm.hpp" +#include "emitters/snippets/x64/kernel_executors/brgemm_amx.hpp" +#include "snippets/utils/utils.hpp" +#include "transformations/snippets/x64/op/brgemm_cpu.hpp" #include "transformations/snippets/x64/op/brgemm_utils.hpp" +#include "utils.hpp" using namespace Xbyak; using namespace dnnl::impl; @@ -17,30 +19,40 @@ using namespace dnnl::impl::cpu::x64; namespace ov { namespace intel_cpu { -jit_brgemm_emitter::jit_brgemm_emitter(jit_generator* h, cpu_isa_t isa, +jit_brgemm_emitter::jit_brgemm_emitter(jit_generator* h, + cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr, const snippets::KernelExecutorTablePtr& kernel_table, - const ov::intel_cpu::MultiCacheWeakPtr& compiled_kernel_cache) : - jit_emitter(h, isa) { + const ov::intel_cpu::MultiCacheWeakPtr& compiled_kernel_cache) + : jit_emitter(h, isa) { in_out_type_ = emitter_in_out_map::gpr_to_gpr; const auto& brgemm_node = as_type_ptr(expr->get_node()); const auto& brg0Prc = brgemm_node->get_input_element_type(0); const auto& brg1Prc = brgemm_node->get_input_element_type(1); const auto brgemm_type = brgemm_node->get_type(); - BrgemmKernelConfig kernel_config(brg0Prc, brg1Prc, with_amx(brgemm_type), with_compensations(brgemm_type), - brgemm_utils::get_primitive_isa(brg0Prc, with_amx(brgemm_type))); - m_kernel_executor = kernel_table->register_kernel(expr, - compiled_kernel_cache, - kernel_config); + 
m_is_with_amx = brgemm_utils::with_amx(brgemm_type); + if (m_is_with_amx) { + BrgemmAMXKernelConfig kernel_config(brg0Prc, brg1Prc, brgemm_utils::get_primitive_isa(brg0Prc, true)); + m_kernel_executor = + kernel_table->register_kernel(expr, compiled_kernel_cache, kernel_config); + } else { + BrgemmKernelConfig kernel_config(brg0Prc, + brg1Prc, + with_compensations(brgemm_type), + brgemm_utils::get_primitive_isa(brg0Prc, false)); + m_kernel_executor = + kernel_table->register_kernel(expr, compiled_kernel_cache, kernel_config); + } // Note: even if the Brgemm node is dynamic, the first shapeInfer and RuntimeConfigurator::update() // are performed before the BrgemmKernelExecutor registration. So we have to trigger update() manually // for both static and the 1st dynamic shapes. OV_CPU_JIT_EMITTER_ASSERT(!snippets::utils::is_dynamic_vdims(expr->get_input_port_descriptor(0)->get_shape()) && - !snippets::utils::is_dynamic_vdims(expr->get_input_port_descriptor(1)->get_shape()), + !snippets::utils::is_dynamic_vdims(expr->get_input_port_descriptor(1)->get_shape()), "Jit emitter is called when the shapes are unknown"); m_memory_offsets = {brgemm_node->get_offset_a(), brgemm_node->get_offset_b(), brgemm_node->get_offset_c()}; - m_buffer_ids = {utils::get_buffer_cluster_id(expr->get_input_port(0)), utils::get_buffer_cluster_id(expr->get_input_port(1)), + m_buffer_ids = {utils::get_buffer_cluster_id(expr->get_input_port(0)), + utils::get_buffer_cluster_id(expr->get_input_port(1)), utils::get_buffer_cluster_id(expr->get_output_port(0))}; if (with_scratchpad(brgemm_type)) { m_memory_offsets.push_back(brgemm_node->get_offset_scratch()); @@ -48,7 +60,8 @@ jit_brgemm_emitter::jit_brgemm_emitter(jit_generator* h, cpu_isa_t isa, } } -std::set> jit_brgemm_emitter::get_supported_precisions(const std::shared_ptr& node) { +std::set> jit_brgemm_emitter::get_supported_precisions( + const std::shared_ptr& node) { const auto brgemm = as_type_ptr(node); OV_CPU_JIT_EMITTER_ASSERT(brgemm, 
"get_supported_precisions() expects BrgemmCPU node"); using brgemm_utils::BRGEMM_TYPE; @@ -71,7 +84,7 @@ std::set> jit_brgemm_emitter::get_supported_precision OV_CPU_JIT_EMITTER_THROW("got BrgemmCPU node with unsupported type"); } -void jit_brgemm_emitter::validate_arguments(const std::vector &in, const std::vector &out) const { +void jit_brgemm_emitter::validate_arguments(const std::vector& in, const std::vector& out) const { OV_CPU_JIT_EMITTER_ASSERT(m_memory_offsets.size() == in.size() + 1 && (out.size() == 1), "expects 3 inputs if there are compensations/wsp"); } @@ -82,22 +95,42 @@ void jit_brgemm_emitter::emit_impl(const std::vector& in, const std::vec if (in.size() > 2) mem_ptrs_idxs.emplace_back(in[2]); + if (std::dynamic_pointer_cast(m_kernel_executor)) + emit_call(mem_ptrs_idxs); + else if (std::dynamic_pointer_cast(m_kernel_executor)) + emit_call(mem_ptrs_idxs); + else + OV_CPU_JIT_EMITTER_THROW("uknown execuor type"); +} + +template ::value, bool>::type> +void jit_brgemm_emitter::emit_call(const std::vector& mem_ptrs_idxs) const { EmitABIRegSpills spill(h); spill.preamble(); - h->mov(h->rbp, reinterpret_cast(BrgemmKernelExecutor::execute)); - auto reserved_stack_size = sizeof(BrgemmKernelExecutor::call_args); + h->mov(h->rbp, reinterpret_cast(T::execute)); + auto reserved_stack_size = sizeof(typename T::call_args); // Reserve memory on the stack h->sub(h->rsp, reserved_stack_size); - const bool is_dynamic_case = std::any_of(m_memory_offsets.cbegin(), m_memory_offsets.cend(), ov::snippets::utils::is_dynamic_value); + const bool is_dynamic_case = + std::any_of(m_memory_offsets.cbegin(), m_memory_offsets.cend(), ov::snippets::utils::is_dynamic_value); Xbyak::Reg64 aux_reg = is_dynamic_case ? 
ov::intel_cpu::utils::get_aux_gpr(mem_ptrs_idxs) : Xbyak::Reg64(); - const std::vector brgemm_args_offsets {GET_OFF_BRGEMM_ARGS(A), GET_OFF_BRGEMM_ARGS(B), GET_OFF_BRGEMM_ARGS(C), GET_OFF_BRGEMM_ARGS(scratch)}; +#define GET_OFF_CALL_ARGS(field) offsetof(typename T::call_args, field) + const std::vector brgemm_args_offsets = {GET_OFF_CALL_ARGS(A), + GET_OFF_CALL_ARGS(B), + GET_OFF_CALL_ARGS(C), + GET_OFF_CALL_ARGS(scratch)}; +#undef GET_OFF_CALL_ARGS + const auto& mem_ptrs = utils::transform_idxs_to_regs(mem_ptrs_idxs); for (size_t i = 0; i < mem_ptrs.size(); i++) { if (ov::snippets::utils::is_dynamic_value(m_memory_offsets[i])) - utils::push_ptr_with_runtime_offset_on_stack(h, brgemm_args_offsets[i], mem_ptrs[i], aux_reg, + utils::push_ptr_with_runtime_offset_on_stack(h, + brgemm_args_offsets[i], + mem_ptrs[i], + aux_reg, GET_OFF(buffer_offsets) + m_buffer_ids[i] * sizeof(size_t)); else utils::push_ptr_with_static_offset_on_stack(h, brgemm_args_offsets[i], mem_ptrs[i], m_memory_offsets[i]); @@ -108,8 +141,10 @@ void jit_brgemm_emitter::emit_impl(const std::vector& in, const std::vec h->mov(h->qword[h->rsp + brgemm_args_offsets.back()], reinterpret_cast(nullptr)); // abi_param1 always contains jit_snippets_call_args which has amx tile config for each thread - h->lea(h->r10, h->ptr[abi_param1 + GET_OFF(amx_tile_config)]); - h->mov(h->qword[h->rsp + GET_OFF_BRGEMM_ARGS(amx_tile_config)], h->r10); + if (std::is_same()) { + h->lea(h->r10, h->ptr[abi_param1 + GET_OFF(amx_tile_config)]); + h->mov(h->qword[h->rsp + GET_OFF_BRGEMM_AMX_ARGS(amx_tile_config)], h->r10); + } h->mov(abi_param1, reinterpret_cast(m_kernel_executor.get())); h->mov(abi_param2, h->rsp); @@ -123,5 +158,5 @@ void jit_brgemm_emitter::emit_impl(const std::vector& in, const std::vec spill.postamble(); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.hpp 
b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.hpp index baa6ed95473034..9d072065c0fe52 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.hpp @@ -5,35 +5,46 @@ #pragma once #include "emitters/plugin/x64/jit_emitter.hpp" -#include "emitters/snippets/x64/kernel_executors/brgemm.hpp" +#include "emitters/snippets/x64/kernel_executors/brgemm_base.hpp" namespace ov { namespace intel_cpu { class jit_brgemm_emitter : public jit_emitter { public: - jit_brgemm_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + jit_brgemm_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr, const snippets::KernelExecutorTablePtr& kernel_table, const ov::intel_cpu::MultiCacheWeakPtr& compiled_kernel_cache); - size_t get_inputs_num() const override { return m_memory_offsets.size() - 1; } - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); + size_t get_inputs_num() const override { + return m_memory_offsets.size() - 1; + } + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); private: - void validate_arguments(const std::vector &in, const std::vector &out) const override; + void validate_arguments(const std::vector& in, const std::vector& out) const override; void emit_impl(const std::vector& in, const std::vector& out) const override; - // Note: offsets order: A, B, C (+ scratchpad, if needed). Values can be dynamic_value if offset is calculated in runtime + template ::value, bool>::type = true> + void emit_call(const std::vector& mem_ptrs_idxs) const; + + // Note: offsets order: A, B, C (+ scratchpad, if needed). Values can be dynamic_value if offset is calculated in + // runtime std::vector m_memory_offsets{}; // Note: cluster ids order: A, B, C (+ scratchpad, if needed). 
Values can be dynamic_value if there is no buffer std::vector m_buffer_ids{}; - std::shared_ptr m_kernel_executor = nullptr; + std::shared_ptr m_kernel_executor = nullptr; #ifdef SNIPPETS_DEBUG_CAPS - friend std::string init_info_jit_brgemm_emitter(const jit_brgemm_emitter *emitter); + friend std::string init_info_jit_brgemm_emitter(const jit_brgemm_emitter* emitter); #endif + + bool m_is_with_amx{false}; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_debug_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_debug_emitter.cpp index 05b9d15786157b..45ebfc83899dba 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_debug_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_debug_emitter.cpp @@ -4,9 +4,11 @@ #ifdef SNIPPETS_DEBUG_CAPS -#include "jit_debug_emitter.hpp" -#include -#include "utils/general_utils.h" +# include "jit_debug_emitter.hpp" + +# include + +# include "utils/general_utils.h" using namespace dnnl::impl::cpu; using namespace dnnl::impl; @@ -27,8 +29,10 @@ size_t jit_debug_emitter::aux_gprs_count() const { return m_target_emitter->aux_gprs_count(); } -void jit_debug_emitter::emitter_preamble(const std::vector &in_idxs, const std::vector &out_idxs, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const { +void jit_debug_emitter::emitter_preamble(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const { m_target_emitter->emitter_preamble(in_idxs, out_idxs, pool_vec_idxs, pool_gpr_idxs); } @@ -52,12 +56,14 @@ void jit_debug_emitter::register_table_entries() { m_target_emitter->register_table_entries(); } -void jit_debug_emitter::emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_debug_emitter::emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const { 
m_target_emitter->emit_impl(in_idxs, out_idxs); } -void jit_debug_emitter::emit_code(const std::vector &in_idxs, const std::vector &out_idxs, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const { +void jit_debug_emitter::emit_code(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const { if (m_decorator_emit_loc == EmissionLocation::preamble || m_decorator_emit_loc == EmissionLocation::both) m_decorator_emitter->emit_code(in_idxs, out_idxs, pool_vec_idxs, pool_gpr_idxs); @@ -67,7 +73,7 @@ void jit_debug_emitter::emit_code(const std::vector &in_idxs, const std: m_decorator_emitter->emit_code(in_idxs, out_idxs, pool_vec_idxs, pool_gpr_idxs); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov #endif \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_debug_emitter.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_debug_emitter.hpp index fe7cc527418587..2591af119cc3b5 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_debug_emitter.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_debug_emitter.hpp @@ -4,29 +4,33 @@ #ifdef SNIPPETS_DEBUG_CAPS -#pragma once - -#include "emitters/plugin/x64/jit_emitter.hpp" +# pragma once +# include "emitters/plugin/x64/jit_emitter.hpp" namespace ov { namespace intel_cpu { class jit_debug_emitter : public jit_emitter { public: - enum class EmissionLocation { - preamble, - postamble, - both - }; - jit_debug_emitter(const std::shared_ptr& target_emitter, const std::shared_ptr& decorator_emitter, const EmissionLocation& loc) - : jit_emitter(target_emitter->h, target_emitter->host_isa_, target_emitter->exec_prc_, target_emitter->in_out_type_), - m_target_emitter(target_emitter), m_decorator_emitter(decorator_emitter), m_decorator_emit_loc(loc) { - prepare_table(); - } - - void emit_code(const std::vector &in_idxs, const 
std::vector &out_idxs, - const std::vector &pool_vec_idxs = {}, const std::vector &pool_gpr_idxs = {}) const override; + enum class EmissionLocation { preamble, postamble, both }; + jit_debug_emitter(const std::shared_ptr& target_emitter, + const std::shared_ptr& decorator_emitter, + const EmissionLocation& loc) + : jit_emitter(target_emitter->h, + target_emitter->host_isa_, + target_emitter->exec_prc_, + target_emitter->in_out_type_), + m_target_emitter(target_emitter), + m_decorator_emitter(decorator_emitter), + m_decorator_emit_loc(loc) { + prepare_table(); + } + + void emit_code(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs = {}, + const std::vector& pool_gpr_idxs = {}) const override; void emit_data() const override; size_t get_inputs_num() const override; @@ -38,10 +42,12 @@ class jit_debug_emitter : public jit_emitter { void prepare_table() override; void register_table_entries() override; - void emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const override; + void emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const override; - void emitter_preamble(const std::vector &in_idxs, const std::vector &out_idxs, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const override; + void emitter_preamble(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const override; void emitter_postamble() const override; private: @@ -54,7 +60,7 @@ class jit_debug_emitter : public jit_emitter { EmissionLocation m_decorator_emit_loc; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov #endif \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.cpp index 1c05100317ae5f..687917acbabc5a 100644 --- 
a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.cpp @@ -4,16 +4,15 @@ #include "jit_fill_emitter.hpp" - using namespace Xbyak; using namespace dnnl::impl; using namespace dnnl::impl::cpu::x64; - namespace ov { namespace intel_cpu { -jit_fill_emitter::jit_fill_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, +jit_fill_emitter::jit_fill_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr) : jit_emitter(h, isa, ov::element::f32, emitter_in_out_map::vec_to_vec) { const auto fill = ov::as_type_ptr(expr->get_node()); @@ -52,9 +51,9 @@ void jit_fill_emitter::emit_impl(const std::vector& in, const std::vecto } template -void jit_fill_emitter::emit_isa(const std::vector &in, const std::vector &out) const { - using Vmm = typename dnnl::impl::utils::conditional3::type; +void jit_fill_emitter::emit_isa(const std::vector& in, const std::vector& out) const { + using Vmm = typename dnnl::impl::utils:: + conditional3::type; Vmm src_vmm = Vmm(in[0]); Vmm dst_vmm = Vmm(out[0]); @@ -62,7 +61,8 @@ void jit_fill_emitter::emit_isa(const std::vector &in, const std::vector const size_t supported_et_size = 4; const auto register_capacity = (src_vmm.getBit() / 8) / supported_et_size; if (offset == register_capacity) { - // WA: since AssignRegisters doesn't support inplace logic, Fill ops with offset = register_capacity can't be removed from the LIR + // WA: since AssignRegisters doesn't support inplace logic, Fill ops with offset = register_capacity can't be + // removed from the LIR // TODO: when inplace is supported, remove such Fill ops from the LIR and remove this logic. 
// Ticket: 126270 if (src_vmm.getIdx() != dst_vmm.getIdx()) @@ -105,5 +105,5 @@ void jit_fill_emitter::fill_tail(const Vmm& src_vmm, const Vmm& dst_vmm) const { } } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.hpp index 79e9a0e4027a5d..23b929cc161ca7 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_fill_emitter.hpp @@ -6,15 +6,18 @@ #include "emitters/plugin/x64/jit_emitter.hpp" - namespace ov { namespace intel_cpu { class jit_fill_emitter : public jit_emitter { public: - jit_fill_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); + jit_fill_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_num() const override {return 1;} + size_t get_inputs_num() const override { + return 1; + } protected: size_t aux_gprs_count() const override; @@ -23,18 +26,22 @@ class jit_fill_emitter : public jit_emitter { void emit_impl(const std::vector& in, const std::vector& out) const override; template - void emit_isa(const std::vector &in, const std::vector &out) const; + void emit_isa(const std::vector& in, const std::vector& out) const; template void fill_full(const Vmm& vmm_dst) const; template void fill_tail(const Vmm& vmm_src, const Vmm& vmm_dst) const; - bool is_full_reg() const { return offset == 0; } - bool is_optimized() const { return is_full_reg() && fill_value == uint32_t(0x0); } + bool is_full_reg() const { + return offset == 0; + } + bool is_optimized() const { + return is_full_reg() && fill_value == uint32_t(0x0); + } size_t offset = 0; uint32_t fill_value = 0x0; }; -} // namespace intel_cpu -} // 
namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_horizon_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_horizon_emitter.cpp index a4f5cbe16d7e1f..34e9c2f71fd148 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_horizon_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_horizon_emitter.cpp @@ -4,7 +4,6 @@ #include "jit_horizon_emitter.hpp" - using namespace Xbyak; using namespace dnnl::impl; using namespace dnnl::impl::cpu::x64; @@ -12,7 +11,8 @@ using namespace dnnl::impl::cpu::x64; namespace ov { namespace intel_cpu { -jit_horizon_emitter::jit_horizon_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, +jit_horizon_emitter::jit_horizon_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr) : jit_emitter(h, isa, ov::element::f32, emitter_in_out_map::vec_to_vec) { if (ov::is_type(expr->get_node())) { @@ -24,8 +24,7 @@ jit_horizon_emitter::jit_horizon_emitter(dnnl::impl::cpu::x64::jit_generator* h, } } -void jit_horizon_emitter::emit_impl(const std::vector& in, - const std::vector& out) const { +void jit_horizon_emitter::emit_impl(const std::vector& in, const std::vector& out) const { if (host_isa_ == dnnl::impl::cpu::x64::sse41) { emit_isa(in, out); } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { @@ -38,9 +37,12 @@ void jit_horizon_emitter::emit_impl(const std::vector& in, } template -void jit_horizon_emitter::emit_isa(const std::vector &in, const std::vector &out) const { +void jit_horizon_emitter::emit_isa(const std::vector& in, const std::vector& out) const { using Vmm = typename dnnl::impl::utils::conditional3::type; + Xbyak::Xmm, + isa == dnnl::impl::cpu::x64::avx2, + Xbyak::Ymm, + Xbyak::Zmm>::type; Vmm src_vmm = Vmm(in[0]); Vmm dst_vmm = Vmm(out[0]); @@ -67,19 +69,19 @@ void jit_horizon_emitter::emit_isa(const 
std::vector &in, const std::vec perform_op(dst_vmm, dst_vmm, aux_vmm); } -template -void jit_horizon_emitter::perform_op(const Vmm &vmm1, const Vmm &vmm2, const Vmm &vmm3) const { +template +void jit_horizon_emitter::perform_op(const Vmm& vmm1, const Vmm& vmm2, const Vmm& vmm3) const { switch (m_op_type) { - case OpType::max: - h->uni_vmaxps(vmm1, vmm2, vmm3); - break; - case OpType::sum: - h->uni_vaddps(vmm1, vmm2, vmm3); - break; - default: - OV_CPU_JIT_EMITTER_THROW("Unsupported horizontal operation."); + case OpType::max: + h->uni_vmaxps(vmm1, vmm2, vmm3); + break; + case OpType::sum: + h->uni_vaddps(vmm1, vmm2, vmm3); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported horizontal operation."); } } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_horizon_emitter.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_horizon_emitter.hpp index 1b222cb2a86776..df74b2ad9783a4 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_horizon_emitter.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_horizon_emitter.hpp @@ -6,34 +6,40 @@ #include "emitters/plugin/x64/jit_emitter.hpp" - namespace ov { namespace intel_cpu { class jit_horizon_emitter : public jit_emitter { public: - jit_horizon_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); + jit_horizon_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_num() const override {return 1;} - static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr) { + size_t get_inputs_num() const override { + return 1; + } + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr) { return {{element::f32}}; } protected: - size_t aux_vecs_count() const override 
{return 1;} + size_t aux_vecs_count() const override { + return 1; + } private: void emit_impl(const std::vector& in, const std::vector& out) const override; template - void emit_isa(const std::vector &in, const std::vector &out) const; + void emit_isa(const std::vector& in, const std::vector& out) const; - template - void perform_op(const Vmm &vmm1, const Vmm &vmm2, const Vmm &vmm3) const; + template + void perform_op(const Vmm& vmm1, const Vmm& vmm2, const Vmm& vmm3) const; enum class OpType { max, sum }; OpType m_op_type = OpType::max; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_kernel_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_kernel_emitter.cpp index 476123355abe70..bd5a3227e1e125 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_kernel_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_kernel_emitter.cpp @@ -14,8 +14,11 @@ using namespace dnnl::impl::cpu::x64; namespace ov { namespace intel_cpu { -jit_kernel_emitter::jit_kernel_emitter(jit_generator* h, cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr) - : jit_emitter(h, isa), reg_runtime_params_idx(abi_param1.getIdx()) { +jit_kernel_emitter::jit_kernel_emitter(jit_generator* h, + cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr) + : jit_emitter(h, isa), + reg_runtime_params_idx(abi_param1.getIdx()) { const auto kernel = ov::as_type_ptr(expr->get_node()); OV_CPU_JIT_EMITTER_ASSERT(kernel != nullptr, "invoked with invalid op argument"); OV_CPU_JIT_EMITTER_ASSERT(!kernel->region->empty(), "invoked with empty body"); @@ -59,8 +62,12 @@ void jit_kernel_emitter::init_reg_pools(const std::set& gpr_blacklist, c gp_regs_pool[i] = vec_regs_pool[i] = 15 - i; auto remove_regs_from_pool = [](std::vector& pool, const std::set& to_remove) { // It's important to keep the order of other elements - 
pool.erase(std::remove_if(pool.begin(), pool.end(), - [&](size_t x) {return to_remove.count(x) != 0;}), pool.end()); + pool.erase(std::remove_if(pool.begin(), + pool.end(), + [&](size_t x) { + return to_remove.count(x) != 0; + }), + pool.end()); }; // Reserve stack base and pointer for push(...) and pop(...) operations std::set gprs_blacklist_extended{Xbyak::Operand::RSP, Xbyak::Operand::RBP}; @@ -70,25 +77,31 @@ void jit_kernel_emitter::init_reg_pools(const std::set& gpr_blacklist, c remove_regs_from_pool(vec_regs_pool, vec_blacklist); } -void jit_kernel_emitter::emit_code(const std::vector &in, const std::vector &out, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const { +void jit_kernel_emitter::emit_code(const std::vector& in, + const std::vector& out, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const { validate_arguments(in, out); emit_impl(in, out); } -void jit_kernel_emitter::validate_arguments(const std::vector &in, const std::vector &out) const { +void jit_kernel_emitter::validate_arguments(const std::vector& in, const std::vector& out) const { OV_CPU_JIT_EMITTER_ASSERT(in.empty() && out.empty(), ": expects 0 registers on input and output"); const auto num_params = num_inputs + num_outputs + num_unique_buffers; // The number of used gpr may be >= num_params since LoopBegin+LoopEnd could also use gpr to store work_amount OV_CPU_JIT_EMITTER_ASSERT(data_ptr_regs_idx.size() == num_params, - "number of inputs and outputs is inconsistent with the number of allocated registers ", num_params, - " data_ptr_regs_idx.size() = ", data_ptr_regs_idx.size()); + "number of inputs and outputs is inconsistent with the number of allocated registers ", + num_params, + " data_ptr_regs_idx.size() = ", + data_ptr_regs_idx.size()); } void jit_kernel_emitter::init_body_regs(const std::set& kernel_regs, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) { + const std::vector& pool_vec_idxs, + const std::vector& 
pool_gpr_idxs) { // Initialize pools of gp and vec registers - // Reserve kernel regs (abi_param1 and, if there is, abi_param2), since they'll be used to pass runtime call args to kernel + // Reserve kernel regs (abi_param1 and, if there is, abi_param2), since they'll be used to pass runtime call args to + // kernel init_reg_pools(kernel_regs, {}); mapping_info gpr_map_pool({}, gp_regs_pool); @@ -122,9 +135,11 @@ void jit_kernel_emitter::emit_impl(const std::vector& in, const std::vec h->postamble(); } -jit_kernel_static_emitter::jit_kernel_static_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, +jit_kernel_static_emitter::jit_kernel_static_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr) - : jit_kernel_emitter(h, isa, expr), reg_indexes_idx(abi_param2.getIdx()) { + : jit_kernel_emitter(h, isa, expr), + reg_indexes_idx(abi_param2.getIdx()) { const auto kernel = ov::as_type_ptr(expr->get_node()); OV_CPU_JIT_EMITTER_ASSERT(kernel != nullptr, "expectes KernelStatic expression"); jcp = *reinterpret_cast(kernel->compile_params); @@ -158,12 +173,12 @@ void jit_kernel_static_emitter::init_data_pointers(const std::vector(*spare_corruptable_gpr)); + Reg64 reg_tmp = + last_iter_explicitly ? data_ptr_regs[num_params - 1] : Reg64(static_cast(*spare_corruptable_gpr)); // Vector "data_ptr_regs" is sorted by abstract regs. 
// It means that the vector contains the physical registers in order [src, .., src, dst, .., dst, buffer] // So we can initialize buffer register firstly as last value of vector "data_ptr_regs" @@ -193,13 +208,15 @@ void jit_kernel_static_emitter::init_data_pointers(const std::vector(expr->get_node()); OV_CPU_JIT_EMITTER_ASSERT(kernel, "expectes KernelDynamic expression"); - // - Reserve abi_param1, since it wll be used to pass runtime call args to all dynamic emitters that needs runtime args + // - Reserve abi_param1, since it wll be used to pass runtime call args to all dynamic emitters that needs runtime + // args // - We cannot assign this register to the body emitters since runtime params MUST be valid during whole execution // for all dynamic emitters init_body_regs({reg_runtime_params_idx}); @@ -220,5 +237,5 @@ void jit_kernel_dynamic_emitter::init_data_pointers(const std::vector &in_idxs, const std::vector &out_idxs, - const std::vector &pool_vec_idxs = {}, const std::vector &pool_gpr_idxs = {}) const override; + jit_kernel_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); + + size_t get_inputs_num() const override { + return 0; + } + void emit_code(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs = {}, + const std::vector& pool_gpr_idxs = {}) const override; protected: void validate_arguments(const std::vector& in, const std::vector& out) const override; - void init_body_regs(const std::set& kernel_regs, const std::vector &pool_vec_idxs = {}, const std::vector &pool_gpr_idxs = {}); + void init_body_regs(const std::set& kernel_regs, + const std::vector& pool_vec_idxs = {}, + const std::vector& pool_gpr_idxs = {}); /** - * @brief populates physical registers pools for x86 (both vec and gp). + * @brief populates physical registers pools for x86 (both vec and gp). * Skips stack-related gprs and extra gprs passed as arguments. 
* @arg gpr_blacklist - set of gp registers that should not be added to register pool * @arg vec_blacklist - set of vec registers should not be added to register pool - */ + */ void init_reg_pools(const std::set& gpr_blacklist, const std::set& vec_blacklist); virtual void init_data_pointers(const std::vector& data_ptr_regs) const = 0; @@ -70,13 +77,15 @@ class jit_kernel_emitter : public jit_emitter, public jit_container_emitter { std::shared_ptr body; #ifdef SNIPPETS_DEBUG_CAPS - friend std::string init_info_jit_kernel_emitter(const jit_kernel_emitter *emitter); + friend std::string init_info_jit_kernel_emitter(const jit_kernel_emitter* emitter); #endif }; class jit_kernel_static_emitter : public jit_kernel_emitter { public: - jit_kernel_static_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); + jit_kernel_static_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); private: void init_data_pointers(const std::vector& data_ptr_regs) const override; @@ -86,21 +95,23 @@ class jit_kernel_static_emitter : public jit_kernel_emitter { std::vector> data_offsets; #ifdef SNIPPETS_DEBUG_CAPS - friend std::string init_info_jit_kernel_static_emitter(const jit_kernel_static_emitter *emitter); + friend std::string init_info_jit_kernel_static_emitter(const jit_kernel_static_emitter* emitter); #endif }; class jit_kernel_dynamic_emitter : public jit_kernel_emitter { public: - jit_kernel_dynamic_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); + jit_kernel_dynamic_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); private: void init_data_pointers(const std::vector& data_ptr_regs) const override; #ifdef SNIPPETS_DEBUG_CAPS - friend std::string 
init_info_jit_kernel_dynamic_emitter(const jit_kernel_dynamic_emitter *emitter); + friend std::string init_info_jit_kernel_dynamic_emitter(const jit_kernel_dynamic_emitter* emitter); #endif }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.cpp index f3151d0df4ccb1..86421678a29011 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.cpp @@ -18,8 +18,11 @@ namespace intel_cpu { namespace { class jit_aux_gpr_holder { public: - jit_aux_gpr_holder(dnnl::impl::cpu::x64::jit_generator* host, std::vector& pool_gpr_idxs, const std::vector& used_gpr_idxs) - : m_h(host), m_pool_gpr_idxs(pool_gpr_idxs) { + jit_aux_gpr_holder(dnnl::impl::cpu::x64::jit_generator* host, + std::vector& pool_gpr_idxs, + const std::vector& used_gpr_idxs) + : m_h(host), + m_pool_gpr_idxs(pool_gpr_idxs) { // If the pool is empty, let's manualy allocate the gpr and push original vlaue on stack if (m_pool_gpr_idxs.empty()) { m_aux_gpr_idx = ov::intel_cpu::utils::get_aux_gpr(used_gpr_idxs); @@ -39,21 +42,26 @@ class jit_aux_gpr_holder { } } - const Reg64& get_reg() const { return m_aux_gpr_idx; } + const Reg64& get_reg() const { + return m_aux_gpr_idx; + } private: dnnl::impl::cpu::x64::jit_generator* m_h; std::vector& m_pool_gpr_idxs; - Reg64 m_aux_gpr_idx {}; + Reg64 m_aux_gpr_idx{}; bool m_is_preserved = false; }; } // namespace /* ================== jit_loop_begin_emitter ====================== */ -jit_loop_begin_emitter::jit_loop_begin_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, +jit_loop_begin_emitter::jit_loop_begin_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr) - : 
jit_emitter(h, isa), loop_begin_label{new Xbyak::Label()}, loop_end_label(nullptr) { + : jit_emitter(h, isa), + loop_begin_label{new Xbyak::Label()}, + loop_end_label(nullptr) { const auto loop_begin = ov::as_type_ptr(expr->get_node()); OV_CPU_JIT_EMITTER_ASSERT(loop_begin, "expects LoopBegin expression"); const auto loop_end = loop_begin->get_loop_end(); @@ -65,7 +73,7 @@ jit_loop_begin_emitter::jit_loop_begin_emitter(dnnl::impl::cpu::x64::jit_generat in_out_type_ = emitter_in_out_map::gpr_to_gpr; } -void jit_loop_begin_emitter::validate_arguments(const std::vector &in, const std::vector &out) const { +void jit_loop_begin_emitter::validate_arguments(const std::vector& in, const std::vector& out) const { OV_CPU_JIT_EMITTER_ASSERT(in.empty(), "Invalid inputs size: expected 0 got " + std::to_string(in.size())); // Note: the only expected output is work amount register (communicated to jit_loop_end_emitter) OV_CPU_JIT_EMITTER_ASSERT(out.size() == 1, "Invalid outputs size: expected 1 got " + std::to_string(out.size())); @@ -74,21 +82,24 @@ void jit_loop_begin_emitter::validate_arguments(const std::vector &in, c "loop increment might be dynamic only if loop evaluates once!"); } -void jit_loop_begin_emitter::emit_code(const std::vector &in, const std::vector &out, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const { +void jit_loop_begin_emitter::emit_code(const std::vector& in, + const std::vector& out, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const { validate_arguments(in, out); jit_emitter::emit_code(in, out, pool_vec_idxs, pool_gpr_idxs); } void jit_loop_begin_emitter::emit_impl(const std::vector& in, const std::vector& out) const { // If the loop evaulate once, we can skip loop begin code emission - // If work_amount is dynamic, we should get runtime `work_amount` - it might be `zero` and we should skip loop evaluation + // If work_amount is dynamic, we should get runtime `work_amount` - it might be `zero` and 
we should skip loop + // evaluation if (evaluate_once && !is_work_amount_dynamic) return; Reg64 reg_work_amount = Reg64(static_cast(out.back())); if (is_work_amount_dynamic) { - jit_aux_gpr_holder gpr_holder(h, aux_gpr_idxs, out); // loop_begin has only output registers + jit_aux_gpr_holder gpr_holder(h, aux_gpr_idxs, out); // loop_begin has only output registers Reg64 reg_loop_args_ptr = gpr_holder.get_reg(); const auto id_offset = loop_id * sizeof(jit_snippets_call_args::loop_args_t); h->mov(reg_loop_args_ptr, h->ptr[abi_param1 + GET_OFF(loop_args)]); @@ -113,9 +124,12 @@ void jit_loop_begin_emitter::emit_impl(const std::vector& in, const std: /* ================== jit_loop_end_emitter ====================== */ -jit_loop_end_emitter::jit_loop_end_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, +jit_loop_end_emitter::jit_loop_end_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr) - : jit_emitter(h, isa), loop_begin_label{nullptr}, loop_end_label{new Xbyak::Label()} { + : jit_emitter(h, isa), + loop_begin_label{nullptr}, + loop_end_label{new Xbyak::Label()} { in_out_type_ = emitter_in_out_map::gpr_to_gpr; const auto loop_end = ov::as_type_ptr(expr->get_node()); OV_CPU_JIT_EMITTER_ASSERT(loop_end != nullptr, "expected LoopEnd expr"); @@ -132,8 +146,9 @@ jit_loop_end_emitter::jit_loop_end_emitter(dnnl::impl::cpu::x64::jit_generator* are_ptr_increments_dynamic = std::any_of(ptr_increments.cbegin(), ptr_increments.cend(), ov::snippets::utils::is_dynamic_value); - are_final_offsets_dynamic = - std::any_of(finalization_offsets.cbegin(), finalization_offsets.cend(), ov::snippets::utils::is_dynamic_value); + are_final_offsets_dynamic = std::any_of(finalization_offsets.cbegin(), + finalization_offsets.cend(), + ov::snippets::utils::is_dynamic_value); are_ptr_shifts_dynamic = are_ptr_increments_dynamic || are_final_offsets_dynamic; const auto begin_expr 
= get_loop_begin_expr(expr); @@ -143,29 +158,51 @@ jit_loop_end_emitter::jit_loop_end_emitter(dnnl::impl::cpu::x64::jit_generator* loop_begin_label = loop_begin_emitter->get_begin_label(); } -ov::snippets::lowered::ExpressionPtr jit_loop_end_emitter::get_loop_begin_expr(const ov::snippets::lowered::ExpressionPtr& expr) { +ov::snippets::lowered::ExpressionPtr jit_loop_end_emitter::get_loop_begin_expr( + const ov::snippets::lowered::ExpressionPtr& expr) { const auto begin_expr = expr->get_input_port_connectors().back()->get_source().get_expr(); OV_CPU_JIT_EMITTER_ASSERT(ov::is_type(begin_expr->get_node()), "LoopEnd expression must have th last port connector to LoopBegin"); return begin_expr; } -void jit_loop_end_emitter::validate_arguments(const std::vector &in, const std::vector &out) const { +void jit_loop_end_emitter::validate_arguments(const std::vector& in, const std::vector& out) const { const auto io_size = num_inputs + num_outputs; OV_CPU_JIT_EMITTER_ASSERT(out.size() == 0, "Invalid number of out arguments: expected ", 0, " got ", out.size()); - OV_CPU_JIT_EMITTER_ASSERT(in.size() == io_size + 1, "Invalid number of in arguments: expected ", io_size + 1, " got ", in.size()); - OV_CPU_JIT_EMITTER_ASSERT(is_incremented.size() == io_size, "Invalid is_incremented size: expected ", io_size, " got ", is_incremented.size()); - OV_CPU_JIT_EMITTER_ASSERT(ptr_increments.size() == io_size, "Invalid ptr_increments size: expected ", io_size, " got ", ptr_increments.size()); + OV_CPU_JIT_EMITTER_ASSERT(in.size() == io_size + 1, + "Invalid number of in arguments: expected ", + io_size + 1, + " got ", + in.size()); + OV_CPU_JIT_EMITTER_ASSERT(is_incremented.size() == io_size, + "Invalid is_incremented size: expected ", + io_size, + " got ", + is_incremented.size()); + OV_CPU_JIT_EMITTER_ASSERT(ptr_increments.size() == io_size, + "Invalid ptr_increments size: expected ", + io_size, + " got ", + ptr_increments.size()); OV_CPU_JIT_EMITTER_ASSERT(finalization_offsets.size() == 
io_size, - "Invalid finalization_offsets size: expected: ", io_size, " got ", finalization_offsets.size()); - OV_CPU_JIT_EMITTER_ASSERT(data_sizes.size() == io_size, "Invalid data_sizes size: expected: ", io_size, " got ", data_sizes.size()); + "Invalid finalization_offsets size: expected: ", + io_size, + " got ", + finalization_offsets.size()); + OV_CPU_JIT_EMITTER_ASSERT(data_sizes.size() == io_size, + "Invalid data_sizes size: expected: ", + io_size, + " got ", + data_sizes.size()); OV_CPU_JIT_EMITTER_ASSERT(loop_end_label != nullptr && loop_begin_label != nullptr, "has not inited labels!"); OV_CPU_JIT_EMITTER_ASSERT(!snippets::utils::is_dynamic_value(wa_increment) || evaluate_once, "loop increment might be dynamic only if loop evaluates once!"); } -void jit_loop_end_emitter::emit_code(const std::vector &in, const std::vector &out, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const { +void jit_loop_end_emitter::emit_code(const std::vector& in, + const std::vector& out, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const { validate_arguments(in, out); jit_emitter::emit_code(in, out, pool_vec_idxs, pool_gpr_idxs); } @@ -176,34 +213,38 @@ void jit_loop_end_emitter::emit_impl(const std::vector& in, const std::v data_ptr_reg_idxs.reserve(num_inputs + num_outputs); std::copy(in.begin(), in.end() - 1, std::back_inserter(data_ptr_reg_idxs)); - auto apply_increments = [&](bool use_runtime_args, size_t field_offset, const std::vector& increments, size_t scale) { - Reg64 reg_increments; - auto add_increments = [&]() { - for (size_t idx = 0; idx < data_ptr_reg_idxs.size(); idx++) { - const auto& increment = increments[idx]; - if (is_incremented[idx] && increment != 0) { - if (ov::snippets::utils::is_dynamic_value(increment)) { - OV_CPU_JIT_EMITTER_ASSERT(use_runtime_args, "Loop argument structure cannot be pushed to aux GPR"); - h->add(Reg64(static_cast(data_ptr_reg_idxs[idx])), h->ptr[reg_increments + idx * 
sizeof(int64_t)]); - } else { - h->add(Reg64(static_cast(data_ptr_reg_idxs[idx])), increment * scale * data_sizes[idx]); + auto apply_increments = + [&](bool use_runtime_args, size_t field_offset, const std::vector& increments, size_t scale) { + Reg64 reg_increments; + auto add_increments = [&]() { + for (size_t idx = 0; idx < data_ptr_reg_idxs.size(); idx++) { + const auto& increment = increments[idx]; + if (is_incremented[idx] && increment != 0) { + if (ov::snippets::utils::is_dynamic_value(increment)) { + OV_CPU_JIT_EMITTER_ASSERT(use_runtime_args, + "Loop argument structure cannot be pushed to aux GPR"); + h->add(Reg64(static_cast(data_ptr_reg_idxs[idx])), + h->ptr[reg_increments + idx * sizeof(int64_t)]); + } else { + h->add(Reg64(static_cast(data_ptr_reg_idxs[idx])), + increment * scale * data_sizes[idx]); + } } } + }; + + const auto id_offset = loop_id * sizeof(jit_snippets_call_args::loop_args_t); + if (use_runtime_args) { + jit_aux_gpr_holder gpr_holder(h, aux_gpr_idxs, in); // loop_end has only input registers + reg_increments = gpr_holder.get_reg(); + h->mov(reg_increments, h->ptr[abi_param1 + GET_OFF(loop_args)]); + h->mov(reg_increments, h->ptr[reg_increments + id_offset + field_offset]); + add_increments(); + } else { + add_increments(); } }; - const auto id_offset = loop_id * sizeof(jit_snippets_call_args::loop_args_t); - if (use_runtime_args) { - jit_aux_gpr_holder gpr_holder(h, aux_gpr_idxs, in); // loop_end has only input registers - reg_increments = gpr_holder.get_reg(); - h->mov(reg_increments, h->ptr[abi_param1 + GET_OFF(loop_args)]); - h->mov(reg_increments, h->ptr[reg_increments + id_offset + field_offset]); - add_increments(); - } else { - add_increments(); - } - }; - if (!evaluate_once) { apply_increments(are_ptr_increments_dynamic, GET_OFF_LOOP_ARGS(m_ptr_increments), ptr_increments, wa_increment); @@ -220,5 +261,5 @@ void jit_loop_end_emitter::emit_impl(const std::vector& in, const std::v /* 
============================================================== */ -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.hpp index 262bba39b7d74c..c0a2b53b100c62 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_emitters.hpp @@ -5,7 +5,6 @@ #pragma once #include "emitters/plugin/x64/jit_emitter.hpp" - #include "snippets/op/loop.hpp" #include "snippets/utils/utils.hpp" @@ -14,25 +13,36 @@ namespace intel_cpu { /* ================== jit_loop_begin_emitter ====================== */ -class jit_loop_begin_emitter: public jit_emitter { +class jit_loop_begin_emitter : public jit_emitter { public: - jit_loop_begin_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + jit_loop_begin_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_num() const override { return 0; } + size_t get_inputs_num() const override { + return 0; + } - void emit_code(const std::vector &in_idxs, const std::vector &out_idxs, - const std::vector &pool_vec_idxs = {}, const std::vector &pool_gpr_idxs = {}) const override; + void emit_code(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs = {}, + const std::vector& pool_gpr_idxs = {}) const override; - void set_loop_end_label(const std::shared_ptr& label) { loop_end_label = label; } - std::shared_ptr get_begin_label() { return loop_begin_label; } + void set_loop_end_label(const std::shared_ptr& label) { + loop_end_label = label; + } + std::shared_ptr get_begin_label() { + return loop_begin_label; + } protected: - void validate_arguments(const std::vector &in, const std::vector &out) const override; 
+ void validate_arguments(const std::vector& in, const std::vector& out) const override; void emit_impl(const std::vector& in, const std::vector& out) const override; // `jit_loop_begin_emitter` handles manually aux_gpr allocation using `jit_aux_gpr_holder` - size_t aux_gprs_count() const override { return 0; } + size_t aux_gprs_count() const override { + return 0; + } std::shared_ptr loop_begin_label = nullptr; std::shared_ptr loop_end_label = nullptr; @@ -43,27 +53,33 @@ class jit_loop_begin_emitter: public jit_emitter { bool is_work_amount_dynamic = false; }; - /* ============================================================== */ /* ================== jit_loop_end_emitter ====================== */ -class jit_loop_end_emitter: public jit_emitter { +class jit_loop_end_emitter : public jit_emitter { public: - jit_loop_end_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, - const ov::snippets::lowered::ExpressionPtr& expr); + jit_loop_end_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_num() const override { return 0; } + size_t get_inputs_num() const override { + return 0; + } - void emit_code(const std::vector &in_idxs, const std::vector &out_idxs, - const std::vector &pool_vec_idxs = {}, const std::vector &pool_gpr_idxs = {}) const override; + void emit_code(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs = {}, + const std::vector& pool_gpr_idxs = {}) const override; protected: - void validate_arguments(const std::vector &in, const std::vector &out) const override; + void validate_arguments(const std::vector& in, const std::vector& out) const override; void emit_impl(const std::vector& in, const std::vector& out) const override; // `jit_loop_end_emitter` handles manually aux_gpr allocation using `jit_aux_gpr_holder` - size_t aux_gprs_count() const override { return 0; } + 
size_t aux_gprs_count() const override { + return 0; + } static ov::snippets::lowered::ExpressionPtr get_loop_begin_expr(const ov::snippets::lowered::ExpressionPtr& expr); @@ -86,5 +102,5 @@ class jit_loop_end_emitter: public jit_emitter { /* ============================================================== */ -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.cpp index b7a5fc2e993398..307ef63a8e6a2e 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.cpp @@ -5,10 +5,9 @@ #include "jit_memory_emitters.hpp" #include "emitters/snippets/jit_snippets_call_args.hpp" +#include "snippets/op/buffer.hpp" #include "transformations/snippets/x64/op/load_convert.hpp" #include "transformations/snippets/x64/op/store_convert.hpp" -#include "snippets/op/buffer.hpp" - using namespace Xbyak; using namespace dnnl::impl; @@ -21,7 +20,10 @@ using jit_generator = dnnl::impl::cpu::x64::jit_generator; using cpu_isa_t = dnnl::impl::cpu::x64::cpu_isa_t; using ExpressionPtr = ov::snippets::lowered::ExpressionPtr; -jit_memory_emitter::jit_memory_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr, emitter_in_out_map in_out_type) +jit_memory_emitter::jit_memory_emitter(jit_generator* h, + cpu_isa_t isa, + const ExpressionPtr& expr, + emitter_in_out_map in_out_type) : jit_emitter(h, isa) { in_out_type_ = in_out_type; @@ -36,7 +38,8 @@ jit_memory_emitter::jit_memory_emitter(jit_generator* h, cpu_isa_t isa, const Ex compiled_byte_offset = memory_access->get_input_offset(); buffer_cluster_id = get_parent_buffer_cluster_id(expr); } else if (in_out_type_ == emitter_in_out_map::vec_to_gpr) { - OV_CPU_JIT_EMITTER_ASSERT(memory_access->is_memory_access_output_port(0), "must be output port - 
memory access"); + OV_CPU_JIT_EMITTER_ASSERT(memory_access->is_memory_access_output_port(0), + "must be output port - memory access"); count = memory_access->get_output_count(); compiled_byte_offset = memory_access->get_output_offset(); buffer_cluster_id = get_consumer_buffer_cluster_id(expr); @@ -46,7 +49,8 @@ jit_memory_emitter::jit_memory_emitter(jit_generator* h, cpu_isa_t isa, const Ex if (ov::snippets::utils::is_dynamic_value(compiled_byte_offset)) { is_offset_runtime = true; - // Compiled byte offset is zero to manually `add` runtime offset before operation and `sub` after to reset pointer in the register + // Compiled byte offset is zero to manually `add` runtime offset before operation and `sub` after to reset + // pointer in the register compiled_byte_offset = 0; OV_CPU_JIT_EMITTER_ASSERT(buffer_cluster_id != SIZE_MAX, "Incorrect buffer offset in call_args"); } @@ -84,8 +88,10 @@ std::vector jit_memory_emitter::get_available_aux_gprs() const { return available_aux_gprs; } -void jit_memory_emitter::emit_code(const std::vector &in_idxs, const std::vector &out_idxs, - const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs) const { +void jit_memory_emitter::emit_code(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const { emitter_preamble(in_idxs, out_idxs, pool_vec_idxs, pool_gpr_idxs); Reg64 reg_runtime_params = abi_param1; // defined by jit_kernel_emitter @@ -152,19 +158,26 @@ void jit_load_broadcast_emitter::emit_impl(const std::vector& in, const } template -void jit_load_broadcast_emitter::emit_isa(const std::vector &in, const std::vector &out) const { - using Vmm = typename dnnl::impl::utils::conditional3::type; +void jit_load_broadcast_emitter::emit_isa(const std::vector& in, const std::vector& out) const { + using Vmm = typename dnnl::impl::utils:: + conditional3::type; Reg64 in_reg(in[0]); Vmm vmm_dst = Vmm(out[0]); - // It doesn't really matter if we 
broadcast or `movss` for vector tails so keep only one version for `BroadcastLoad`, - // key point here is not to add post-increment, it might be fixed by some other approach in future + // It doesn't really matter if we broadcast or `movss` for vector tails so keep only one version for + // `BroadcastLoad`, key point here is not to add post-increment, it might be fixed by some other approach in future switch (src_prc.size()) { - case 4: h->uni_vbroadcastss(vmm_dst, h->ptr[in_reg + compiled_byte_offset]); break; - case 2: h->vpbroadcastw(vmm_dst, h->ptr[in_reg + compiled_byte_offset]); break; - case 1: h->vpbroadcastb(vmm_dst, h->ptr[in_reg + compiled_byte_offset]); break; - default: OV_CPU_JIT_EMITTER_THROW("Unsupported data type"); + case 4: + h->uni_vbroadcastss(vmm_dst, h->ptr[in_reg + compiled_byte_offset]); + break; + case 2: + h->vpbroadcastw(vmm_dst, h->ptr[in_reg + compiled_byte_offset]); + break; + case 1: + h->vpbroadcastb(vmm_dst, h->ptr[in_reg + compiled_byte_offset]); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported data type"); } } @@ -190,5 +203,5 @@ void jit_store_memory_emitter::emit_data() const { store_emitter->emit_data(); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.hpp index 55a41c977dd67c..d21e85d53e7193 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.hpp @@ -7,17 +7,20 @@ #include "emitters/plugin/x64/jit_emitter.hpp" #include "emitters/plugin/x64/jit_load_store_emitters.hpp" - namespace ov { namespace intel_cpu { -class jit_memory_emitter : public jit_emitter { +class jit_memory_emitter : public jit_emitter { public: - jit_memory_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, - const 
ov::snippets::lowered::ExpressionPtr& expr, emitter_in_out_map in_out_type); + jit_memory_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr, + emitter_in_out_map in_out_type); - void emit_code(const std::vector &in_idxs, const std::vector &out_idxs, - const std::vector &pool_vec_idxs = {}, const std::vector &pool_gpr_idxs = {}) const override; + void emit_code(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs = {}, + const std::vector& pool_gpr_idxs = {}) const override; protected: static size_t get_parent_buffer_cluster_id(const ov::snippets::lowered::ExpressionPtr& expr); @@ -36,16 +39,19 @@ class jit_memory_emitter : public jit_emitter { bool is_offset_runtime = false; #ifdef SNIPPETS_DEBUG_CAPS - friend std::string init_info_jit_memory_emitter(const jit_memory_emitter *emitter); + friend std::string init_info_jit_memory_emitter(const jit_memory_emitter* emitter); #endif }; class jit_load_memory_emitter : public jit_memory_emitter { public: - jit_load_memory_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + jit_load_memory_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_num() const override {return 0;} + size_t get_inputs_num() const override { + return 0; + } private: void emit_impl(const std::vector& in, const std::vector& out) const override; @@ -58,24 +64,30 @@ class jit_load_memory_emitter : public jit_memory_emitter { class jit_load_broadcast_emitter : public jit_memory_emitter { public: - jit_load_broadcast_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + jit_load_broadcast_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_num() const 
override {return 0;} + size_t get_inputs_num() const override { + return 0; + } private: void emit_impl(const std::vector& in, const std::vector& out) const override; template - void emit_isa(const std::vector &in, const std::vector &out) const; + void emit_isa(const std::vector& in, const std::vector& out) const; }; -class jit_store_memory_emitter : public jit_memory_emitter { +class jit_store_memory_emitter : public jit_memory_emitter { public: - jit_store_memory_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + jit_store_memory_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_num() const override {return 1;} + size_t get_inputs_num() const override { + return 1; + } private: void emit_impl(const std::vector& in, const std::vector& out) const override; @@ -86,5 +98,5 @@ class jit_store_memory_emitter : public jit_memory_emitter { std::unique_ptr store_emitter = nullptr; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_perf_count_chrono_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_perf_count_chrono_emitters.cpp index f89e906ce57593..ccb4da742e38d6 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_perf_count_chrono_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_perf_count_chrono_emitters.cpp @@ -3,9 +3,9 @@ // #ifdef SNIPPETS_DEBUG_CAPS -#include "jit_perf_count_chrono_emitters.hpp" +# include "jit_perf_count_chrono_emitters.hpp" -#include "emitters/plugin/x64/utils.hpp" +# include "emitters/plugin/x64/utils.hpp" using namespace dnnl::impl; using namespace dnnl::impl::utils; @@ -17,8 +17,10 @@ using namespace Xbyak::util; namespace ov { namespace intel_cpu { 
-jit_perf_count_chrono_start_emitter::jit_perf_count_chrono_start_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - const std::shared_ptr& n) : jit_emitter(host, host_isa) { +jit_perf_count_chrono_start_emitter::jit_perf_count_chrono_start_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n) + : jit_emitter(host, host_isa) { m_start_node = ov::as_type_ptr(n); } @@ -30,11 +32,12 @@ void jit_perf_count_chrono_start_emitter::set_start_time(snippets::op::PerfCount start_node->set_start_time(); } -void jit_perf_count_chrono_start_emitter::emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_perf_count_chrono_start_emitter::emit_impl(const std::vector& in_idxs, + const std::vector& out_idxs) const { EmitABIRegSpills spill(h); spill.preamble(); - const auto &set_start_time_overload = static_cast(set_start_time); + const auto& set_start_time_overload = static_cast(set_start_time); h->mov(h->rax, reinterpret_cast(set_start_time_overload)); h->mov(abi_param1, reinterpret_cast(m_start_node.get())); @@ -46,8 +49,10 @@ void jit_perf_count_chrono_start_emitter::emit_impl(const std::vector &i } ///////////////////jit_perf_count_chrono_end_emitter//////////////////////////////////// -jit_perf_count_chrono_end_emitter::jit_perf_count_chrono_end_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - const std::shared_ptr& n) : jit_emitter(host, host_isa) { +jit_perf_count_chrono_end_emitter::jit_perf_count_chrono_end_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n) + : jit_emitter(host, host_isa) { m_end_node = ov::as_type_ptr(n); } @@ -59,11 +64,13 @@ void jit_perf_count_chrono_end_emitter::set_accumulated_time(snippets::op::PerfC end_node->set_accumulated_time(); } -void jit_perf_count_chrono_end_emitter::emit_impl(const 
std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_perf_count_chrono_end_emitter::emit_impl(const std::vector& in_idxs, + const std::vector& out_idxs) const { EmitABIRegSpills spill(h); spill.preamble(); - const auto &set_accumulated_time_overload = static_cast(set_accumulated_time); + const auto& set_accumulated_time_overload = + static_cast(set_accumulated_time); h->mov(h->rax, reinterpret_cast(set_accumulated_time_overload)); h->mov(abi_param1, reinterpret_cast(m_end_node.get())); @@ -74,6 +81,6 @@ void jit_perf_count_chrono_end_emitter::emit_impl(const std::vector &in_ spill.postamble(); } -} // namespace intel_cpu -} // namespace ov -#endif // SNIPPETS_DEBUG_CAPS +} // namespace intel_cpu +} // namespace ov +#endif // SNIPPETS_DEBUG_CAPS diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_perf_count_chrono_emitters.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_perf_count_chrono_emitters.hpp index e8608afc7f1428..817c0583609778 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_perf_count_chrono_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_perf_count_chrono_emitters.hpp @@ -3,24 +3,23 @@ // #ifdef SNIPPETS_DEBUG_CAPS -#pragma once - -#include "emitters/plugin/x64/jit_emitter.hpp" - -#include "snippets/op/perf_count.hpp" +# pragma once +# include "emitters/plugin/x64/jit_emitter.hpp" +# include "snippets/op/perf_count.hpp" namespace ov { namespace intel_cpu { class jit_perf_count_chrono_start_emitter : public jit_emitter { public: - jit_perf_count_chrono_start_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_perf_count_chrono_start_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n); size_t get_inputs_num() const override; private: - void emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const override; + void emit_impl(const std::vector& in_idxs, 
const std::vector& out_idxs) const override; static void set_start_time(snippets::op::PerfCountBegin* start_node); std::shared_ptr m_start_node = nullptr; @@ -28,17 +27,18 @@ class jit_perf_count_chrono_start_emitter : public jit_emitter { class jit_perf_count_chrono_end_emitter : public jit_emitter { public: - jit_perf_count_chrono_end_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_perf_count_chrono_end_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n); size_t get_inputs_num() const override; private: - void emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const override; + void emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const override; static void set_accumulated_time(snippets::op::PerfCountEnd* end_node); std::shared_ptr m_end_node = nullptr; }; -} // namespace intel_cpu -} // namespace ov -#endif // SNIPPETS_DEBUG_CAPS +} // namespace intel_cpu +} // namespace ov +#endif // SNIPPETS_DEBUG_CAPS diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_perf_count_rdtsc_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_perf_count_rdtsc_emitters.cpp index c469c052ce3ef6..e951f8042ad762 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_perf_count_rdtsc_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_perf_count_rdtsc_emitters.cpp @@ -3,7 +3,7 @@ // #ifdef SNIPPETS_DEBUG_CAPS -#include "jit_perf_count_rdtsc_emitters.hpp" +# include "jit_perf_count_rdtsc_emitters.hpp" using namespace dnnl::impl; using namespace dnnl::impl::utils; @@ -15,8 +15,10 @@ using namespace Xbyak::util; namespace ov { namespace intel_cpu { -jit_perf_count_rdtsc_start_emitter::jit_perf_count_rdtsc_start_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - const std::shared_ptr& n) : jit_emitter(host, host_isa) { 
+jit_perf_count_rdtsc_start_emitter::jit_perf_count_rdtsc_start_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n) + : jit_emitter(host, host_isa) { m_start_node = ov::as_type_ptr(n); } @@ -24,16 +26,18 @@ size_t jit_perf_count_rdtsc_start_emitter::get_inputs_num() const { return 0; } -void jit_perf_count_rdtsc_start_emitter::emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_perf_count_rdtsc_start_emitter::emit_impl(const std::vector& in_idxs, + const std::vector& out_idxs) const { h->push(h->rax); h->push(h->rdx); - // The EDX register is loaded with the high-order 32 bits of the MSR and the EAX register is loaded with the low-order 32 bits. + // The EDX register is loaded with the high-order 32 bits of the MSR and the EAX register is loaded with the + // low-order 32 bits. h->lfence(); h->rdtsc(); h->lfence(); - h->shl(h->rdx, 0x20); // shift to higher half of rdx 0x20(32) - h->or_(h->rdx, h->rax); // rdx has current tsc + h->shl(h->rdx, 0x20); // shift to higher half of rdx 0x20(32) + h->or_(h->rdx, h->rax); // rdx has current tsc h->mov(h->rax, reinterpret_cast(&m_start_node->start_count)); h->mov(qword[h->rax], h->rdx); @@ -43,16 +47,19 @@ void jit_perf_count_rdtsc_start_emitter::emit_impl(const std::vector &in } ///////////////////jit_perf_count_rdtsc_end_emitter//////////////////////////////////// -jit_perf_count_rdtsc_end_emitter::jit_perf_count_rdtsc_end_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - const std::shared_ptr& n) : jit_emitter(host, host_isa) { - m_end_node = ov::as_type_ptr(n); +jit_perf_count_rdtsc_end_emitter::jit_perf_count_rdtsc_end_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n) + : jit_emitter(host, host_isa) { + m_end_node = ov::as_type_ptr(n); } size_t jit_perf_count_rdtsc_end_emitter::get_inputs_num() const 
{ return 0; } -void jit_perf_count_rdtsc_end_emitter::emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const { +void jit_perf_count_rdtsc_end_emitter::emit_impl(const std::vector& in_idxs, + const std::vector& out_idxs) const { h->push(h->rax); h->push(h->rdx); @@ -79,6 +86,6 @@ void jit_perf_count_rdtsc_end_emitter::emit_impl(const std::vector &in_i h->pop(h->rax); } -} // namespace intel_cpu -} // namespace ov -#endif // SNIPPETS_DEBUG_CAPS +} // namespace intel_cpu +} // namespace ov +#endif // SNIPPETS_DEBUG_CAPS diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_perf_count_rdtsc_emitters.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_perf_count_rdtsc_emitters.hpp index c3ae1aac01ab9d..343807bdfcd076 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_perf_count_rdtsc_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_perf_count_rdtsc_emitters.hpp @@ -3,40 +3,40 @@ // #ifdef SNIPPETS_DEBUG_CAPS -#pragma once - -#include "emitters/plugin/x64/jit_emitter.hpp" - -#include "transformations/snippets/x64/op/perf_count_rdtsc.hpp" +# pragma once +# include "emitters/plugin/x64/jit_emitter.hpp" +# include "transformations/snippets/x64/op/perf_count_rdtsc.hpp" namespace ov { namespace intel_cpu { class jit_perf_count_rdtsc_start_emitter : public jit_emitter { public: - jit_perf_count_rdtsc_start_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - const std::shared_ptr& n); + jit_perf_count_rdtsc_start_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n); size_t get_inputs_num() const override; private: - void emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const override; + void emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const override; std::shared_ptr m_start_node = nullptr; }; class jit_perf_count_rdtsc_end_emitter : public jit_emitter { 
public: - jit_perf_count_rdtsc_end_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - const std::shared_ptr& n); + jit_perf_count_rdtsc_end_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + const std::shared_ptr& n); size_t get_inputs_num() const override; private: - void emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const override; + void emit_impl(const std::vector& in_idxs, const std::vector& out_idxs) const override; std::shared_ptr m_end_node = nullptr; }; -} // namespace intel_cpu -} // namespace ov -#endif // SNIPPETS_DEBUG_CAPS +} // namespace intel_cpu +} // namespace ov +#endif // SNIPPETS_DEBUG_CAPS diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_segfault_detector_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_segfault_detector_emitter.cpp index f88c345ff055b5..c513e969144d1c 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_segfault_detector_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_segfault_detector_emitter.cpp @@ -4,8 +4,9 @@ #ifdef SNIPPETS_DEBUG_CAPS -#include "jit_segfault_detector_emitter.hpp" -#include "emitters/plugin/x64/utils.hpp" +# include "jit_segfault_detector_emitter.hpp" + +# include "emitters/plugin/x64/utils.hpp" using namespace dnnl::impl::utils; using namespace dnnl::impl; @@ -18,22 +19,28 @@ namespace intel_cpu { std::shared_ptr> g_custom_segfault_handler = std::make_shared>(); -jit_uni_segfault_detector_emitter::jit_uni_segfault_detector_emitter(dnnl::impl::cpu::x64::jit_generator* host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - jit_emitter* target_emitter, bool is_load, bool is_store, std::string target_node_name) : - jit_emitter(host, host_isa), - m_target_emitter(target_emitter), - is_target_use_load_emitter(is_load), - is_target_use_store_emitter(is_store), - m_target_node_name(target_node_name) { 
+jit_uni_segfault_detector_emitter::jit_uni_segfault_detector_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_emitter* target_emitter, + bool is_load, + bool is_store, + std::string target_node_name) + : jit_emitter(host, host_isa), + m_target_emitter(target_emitter), + is_target_use_load_emitter(is_load), + is_target_use_store_emitter(is_store), + m_target_node_name(target_node_name) {} + +size_t jit_uni_segfault_detector_emitter::get_inputs_num() const { + return 1; } -size_t jit_uni_segfault_detector_emitter::get_inputs_num() const { return 1; } - const jit_emitter* jit_uni_segfault_detector_emitter::get_target_emitter() const { return m_target_emitter; } -void jit_uni_segfault_detector_emitter::emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const { +void jit_uni_segfault_detector_emitter::emit_impl(const std::vector& in_vec_idxs, + const std::vector& out_vec_idxs) const { save_target_emitter(); if (is_target_use_load_emitter) { memory_track(in_vec_idxs[0]); @@ -47,7 +54,8 @@ void jit_uni_segfault_detector_emitter::save_target_emitter() const { EmitABIRegSpills spill(h); spill.preamble(); - const auto &set_local_handler_overload = static_cast(set_local_handler); + const auto& set_local_handler_overload = + static_cast(set_local_handler); h->mov(h->rax, reinterpret_cast(set_local_handler_overload)); h->mov(abi_param1, reinterpret_cast(this)); @@ -85,7 +93,7 @@ void jit_uni_segfault_detector_emitter::memory_track(size_t gpr_idx_for_mem_addr h->pop(h->r15); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov #endif diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_segfault_detector_emitter.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_segfault_detector_emitter.hpp index 21ffaa84cf3db8..86191ae865fe38 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_segfault_detector_emitter.hpp +++ 
b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_segfault_detector_emitter.hpp @@ -4,11 +4,12 @@ #ifdef SNIPPETS_DEBUG_CAPS -#pragma once +# pragma once -#include -#include "emitters/plugin/x64/jit_emitter.hpp" -#include "openvino/runtime/threading/thread_local.hpp" +# include + +# include "emitters/plugin/x64/jit_emitter.hpp" +# include "openvino/runtime/threading/thread_local.hpp" using namespace ov::threading; @@ -20,18 +21,22 @@ extern std::shared_ptr> g_custom class jit_uni_segfault_detector_emitter : public jit_emitter { public: - jit_uni_segfault_detector_emitter(dnnl::impl::cpu::x64::jit_generator* host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, - jit_emitter* target_emitter, bool is_load, bool is_store, std::string target_node_name); + jit_uni_segfault_detector_emitter(dnnl::impl::cpu::x64::jit_generator* host, + dnnl::impl::cpu::x64::cpu_isa_t host_isa, + jit_emitter* target_emitter, + bool is_load, + bool is_store, + std::string target_node_name); size_t get_inputs_num() const override; const jit_emitter* get_target_emitter() const; private: - // emit code is to save "this" pointer(jit_uni_segfault_detector_emitter) to global handler, then print info w/ it's target_emitter. - // and to save tracked memory address, iteration, etc to print + // emit code is to save "this" pointer(jit_uni_segfault_detector_emitter) to global handler, then print info w/ it's + // target_emitter. 
and to save tracked memory address, iteration, etc to print void emit_impl(const std::vector& in_vec_idxs, const std::vector& out_vec_idxs) const override; - jit_emitter *m_target_emitter = nullptr; + jit_emitter* m_target_emitter = nullptr; bool is_target_use_load_emitter = false; bool is_target_use_store_emitter = false; std::string m_target_node_name = ""; @@ -44,10 +49,10 @@ class jit_uni_segfault_detector_emitter : public jit_emitter { mutable size_t current_address = 0; mutable size_t iteration = 0; - friend std::string init_info_jit_uni_segfault_detector_emitter(const jit_uni_segfault_detector_emitter *emitter); + friend std::string init_info_jit_uni_segfault_detector_emitter(const jit_uni_segfault_detector_emitter* emitter); }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov #endif \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_snippets_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_snippets_emitters.cpp index d8066f9a126543..ba4012de86d83d 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_snippets_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_snippets_emitters.cpp @@ -15,7 +15,10 @@ using jit_generator = dnnl::impl::cpu::x64::jit_generator; using cpu_isa_t = dnnl::impl::cpu::x64::cpu_isa_t; using ExpressionPtr = ov::snippets::lowered::ExpressionPtr; -jit_nop_emitter::jit_nop_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr, emitter_in_out_map emitter_type) +jit_nop_emitter::jit_nop_emitter(jit_generator* h, + cpu_isa_t isa, + const ExpressionPtr& expr, + emitter_in_out_map emitter_type) : jit_emitter(h, isa) { in_out_type_ = emitter_type; } @@ -25,7 +28,8 @@ jit_parameter_emitter::jit_parameter_emitter(jit_generator* h, cpu_isa_t isa, co in_out_type_ = emitter_in_out_map::gpr_to_gpr; } -jit_result_emitter::jit_result_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) : 
jit_nop_emitter(h, isa, expr) { +jit_result_emitter::jit_result_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) + : jit_nop_emitter(h, isa, expr) { in_out_type_ = emitter_in_out_map::gpr_to_gpr; } @@ -34,14 +38,13 @@ jit_broadcast_move_emitter::jit_broadcast_move_emitter(jit_generator* h, cpu_isa const auto n = expr->get_node(); if (n->get_input_element_type(0) != n->get_output_element_type(0)) OV_CPU_JIT_EMITTER_THROW("supports only equal input and output types but gets: ", - n->get_input_element_type(0), - " and ", - n->get_output_element_type(0)); + n->get_input_element_type(0), + " and ", + n->get_output_element_type(0)); byte_size = n->get_input_element_type(0).size(); } -void jit_broadcast_move_emitter::emit_impl(const std::vector& in, - const std::vector& out) const { +void jit_broadcast_move_emitter::emit_impl(const std::vector& in, const std::vector& out) const { if (host_isa_ == dnnl::impl::cpu::x64::sse41) { emit_isa(in, out); } else if (host_isa_ == dnnl::impl::cpu::x64::avx2) { @@ -54,17 +57,24 @@ void jit_broadcast_move_emitter::emit_impl(const std::vector& in, } template -void jit_broadcast_move_emitter::emit_isa(const std::vector &in, const std::vector &out) const { - using Vmm = typename dnnl::impl::utils::conditional3::type; +void jit_broadcast_move_emitter::emit_isa(const std::vector& in, const std::vector& out) const { + using Vmm = typename dnnl::impl::utils:: + conditional3::type; Xmm xmm_src0 = Xmm(in[0]); - Vmm vmm_dst = Vmm(out[0]); + Vmm vmm_dst = Vmm(out[0]); switch (byte_size) { - case 4: h->uni_vbroadcastss(vmm_dst, xmm_src0); break; - case 2: h->vpbroadcastw(vmm_dst, xmm_src0); break; - case 1: h->vpbroadcastb(vmm_dst, xmm_src0); break; - default: OV_CPU_JIT_EMITTER_THROW("unsupported data type"); + case 4: + h->uni_vbroadcastss(vmm_dst, xmm_src0); + break; + case 2: + h->vpbroadcastw(vmm_dst, xmm_src0); + break; + case 1: + h->vpbroadcastb(vmm_dst, xmm_src0); + break; + default: + 
OV_CPU_JIT_EMITTER_THROW("unsupported data type"); } } @@ -74,14 +84,20 @@ int32_t jit_scalar_emitter::read_value(const ov::snippets::lowered::ExpressionPt const auto& precision = n->get_output_element_type(0); int32_t res = INT_MIN; switch (precision) { - case element::i32: res = n->cast_vector(1)[0]; break; - case element::f32: res = dnnl::impl::cpu::x64::float2int(n->cast_vector(1)[0]); break; - default: OV_CPU_JIT_EMITTER_THROW("doesn't support ", precision); + case element::i32: + res = n->cast_vector(1)[0]; + break; + case element::f32: + res = dnnl::impl::cpu::x64::float2int(n->cast_vector(1)[0]); + break; + default: + OV_CPU_JIT_EMITTER_THROW("doesn't support ", precision); } return res; } -jit_scalar_emitter::jit_scalar_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) : jit_emitter(h, isa) { +jit_scalar_emitter::jit_scalar_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) + : jit_emitter(h, isa) { push_arg_entry_of("scalar", read_value(expr), true); prepare_table(); } @@ -89,21 +105,27 @@ jit_scalar_emitter::jit_scalar_emitter(jit_generator* h, cpu_isa_t isa, const Ex void jit_scalar_emitter::emit_impl(const std::vector& in, const std::vector& out) const { using isa = cpu_isa_t; switch (host_isa_) { - case isa::sse41: emit_isa(in, out); break; - case isa::avx2: emit_isa(in, out); break; - case isa::avx512_core: emit_isa(in, out); break; - default: OV_CPU_JIT_EMITTER_THROW("Unsupported isa ", host_isa_); + case isa::sse41: + emit_isa(in, out); + break; + case isa::avx2: + emit_isa(in, out); + break; + case isa::avx512_core: + emit_isa(in, out); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Unsupported isa ", host_isa_); } } template -void jit_scalar_emitter::emit_isa(const std::vector &in, const std::vector &out) const { - using Vmm = typename dnnl::impl::utils::conditional3::type; - Vmm vmm_dst = Vmm(out[0]); +void jit_scalar_emitter::emit_isa(const std::vector& in, const std::vector& out) const { + using Vmm = 
typename dnnl::impl::utils:: + conditional3::type; + Vmm vmm_dst = Vmm(out[0]); h->uni_vbroadcastss(vmm_dst, table_val("scalar")); } - } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_snippets_emitters.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_snippets_emitters.hpp index c75f071c4ec7e0..6a91e3b7c47d3d 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_snippets_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_snippets_emitters.hpp @@ -6,16 +6,19 @@ #include "emitters/plugin/x64/jit_emitter.hpp" - namespace ov { namespace intel_cpu { class jit_nop_emitter : public jit_emitter { public: - jit_nop_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, - const ov::snippets::lowered::ExpressionPtr& expr, emitter_in_out_map emitter_type = gpr_to_gpr); + jit_nop_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr, + emitter_in_out_map emitter_type = gpr_to_gpr); - size_t get_inputs_num() const override {return 0;} + size_t get_inputs_num() const override { + return 0; + } private: void emit_impl(const std::vector& in, const std::vector& out) const override {} @@ -23,31 +26,40 @@ class jit_nop_emitter : public jit_emitter { class jit_parameter_emitter : public jit_nop_emitter { public: - jit_parameter_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + jit_parameter_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_num() const override { return 0; } + size_t get_inputs_num() const override { + return 0; + } }; class jit_result_emitter : public jit_nop_emitter { public: - jit_result_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + 
jit_result_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_num() const override {return 1;} + size_t get_inputs_num() const override { + return 1; + } }; class jit_broadcast_move_emitter : public jit_emitter { public: - jit_broadcast_move_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + jit_broadcast_move_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_num() const override {return 1;} + size_t get_inputs_num() const override { + return 1; + } private: void emit_impl(const std::vector& in, const std::vector& out) const override; template - void emit_isa(const std::vector &in, const std::vector &out) const; + void emit_isa(const std::vector& in, const std::vector& out) const; private: size_t byte_size = 0lu; @@ -55,18 +67,23 @@ class jit_broadcast_move_emitter : public jit_emitter { class jit_scalar_emitter : public jit_emitter { public: - jit_scalar_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + jit_scalar_emitter(dnnl::impl::cpu::x64::jit_generator* h, + dnnl::impl::cpu::x64::cpu_isa_t isa, const ov::snippets::lowered::ExpressionPtr& expr); - size_t get_inputs_num() const override {return 0;} - size_t aux_gprs_count() const override {return 1;} + size_t get_inputs_num() const override { + return 0; + } + size_t aux_gprs_count() const override { + return 1; + } static int32_t read_value(const ov::snippets::lowered::ExpressionPtr& expr); private: void emit_impl(const std::vector& in, const std::vector& out) const override; template - void emit_isa(const std::vector &in, const std::vector &out) const; + void emit_isa(const std::vector& in, const std::vector& out) const; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git 
a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.cpp index fad1be5a5d1289..58a31a1804782a 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.cpp @@ -1,135 +1,57 @@ -// Copyright (C) 2020-2023 Intel Corporation +// Copyright (C) 2020-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #include "brgemm.hpp" -#include - #include "common/utils.hpp" #include "dnnl_extension_utils.h" -#include "snippets/lowered/loop_manager.hpp" #include "snippets/lowered/pass/insert_specific_iterations.hpp" #include "transformations/snippets/x64/op/brgemm_cpu.hpp" #include "transformations/snippets/x64/op/brgemm_utils.hpp" -#define DIM_CAST(X) static_cast(X) -#define DTYPE_CAST(X) static_cast(DnnlExtensionUtils::ElementTypeToDataType(X)) - using namespace Xbyak; using namespace dnnl::impl; using namespace dnnl::impl::cpu::x64; -namespace { -size_t init_hash(dnnl_data_type_t dt_in0, dnnl_data_type_t dt_in1, bool is_with_amx, - bool is_with_comp, dnnl::impl::cpu::x64::cpu_isa_t isa) { - size_t seed = 0; -#define HASH(X) seed = hash_combine(seed, X) - HASH(dt_in0); HASH(dt_in1); - HASH(is_with_amx); HASH(is_with_comp); - HASH(isa); -#undef HASH - return seed; -} -} // namespace - namespace ov { namespace intel_cpu { -BrgemmKernelConfig::BrgemmKernelConfig(const element::Type& in0_dtype, const element::Type& in1_dtype, - bool is_with_amx, bool is_with_comp, - dnnl::impl::cpu::x64::cpu_isa_t primitive_isa) : - m_static_params(std::make_shared(in0_dtype, in1_dtype, - is_with_amx, is_with_comp, - primitive_isa)) { - m_hash = compute_hash(); -} -bool BrgemmKernelConfig::is_completed() const { - return !utils::one_of(0, m_M, m_N, m_K, m_LDA, m_LDB, m_LDC) || is_empty(); -} - -bool BrgemmKernelConfig::operator==(const BrgemmKernelConfig& rhs) const { -#define EQ(X) X 
== rhs.X - return EQ(m_hash) && EQ(m_beta) && - EQ(m_M) && EQ(m_N) && EQ(m_K) && - EQ(m_LDA) && EQ(m_LDB) && EQ(m_LDC) && - (EQ(m_static_params.get()) || *m_static_params == *(rhs.m_static_params)); -#undef EQ -} - -void BrgemmKernelConfig::update(dnnl_dim_t M, dnnl_dim_t N, dnnl_dim_t K, dnnl_dim_t LDA, dnnl_dim_t LDB, dnnl_dim_t LDC, float beta) { - // If M is zero, it means that Brgemm won't be executed (in Loop with work_amount = 0, for example) - // To process this case, we have to make this Config as empty (nullify runtime parameters) - if (utils::one_of(0, M, N, K)) { - m_M = 0; m_N = 0; m_K = 0; - m_LDA = 0; m_LDB = 0; m_LDC = 0; - m_beta = 0; - } else { - m_M = M; m_N = N; m_K = K; - m_LDA = LDA; m_LDB = LDB; m_LDC = LDC; - m_beta = beta; - } +BrgemmKernelConfig::BrgemmKernelConfig(const element::Type& in0_dtype, + const element::Type& in1_dtype, + bool is_with_comp, + dnnl::impl::cpu::x64::cpu_isa_t primitive_isa) + : BrgemmBaseKernelConfig(), + m_static_params(std::make_shared(in0_dtype, in1_dtype, is_with_comp, primitive_isa)) { m_hash = compute_hash(); } -bool BrgemmKernelConfig::is_empty() const { - return everyone_is(0, m_M, m_N, m_K, m_LDA, m_LDB, m_LDC, m_beta); -} - -BrgemmKernelConfig::operator amx_tile_config_t() const { - amx_tile_config_t res; - res.M = m_M; res.N = m_N; res.K = m_K; - return res; -} - -BrgemmKernelConfig::StaticParams::StaticParams(const element::Type& in0_dtype, const element::Type& in1_dtype, - bool is_with_amx, bool is_with_comp, - dnnl::impl::cpu::x64::cpu_isa_t primitive_isa) : - dt_in0(DTYPE_CAST(in0_dtype)), dt_in1(DTYPE_CAST(in1_dtype)), - is_with_amx(is_with_amx), is_with_comp(is_with_comp), - isa(primitive_isa), - hash(init_hash(dt_in0, dt_in1, is_with_amx, is_with_comp, isa)) { -} +BrgemmKernelConfig::StaticParams::StaticParams(const element::Type& in0_dtype, + const element::Type& in1_dtype, + bool is_with_comp, + dnnl::impl::cpu::x64::cpu_isa_t primitive_isa) + : StaticBaseParams(in0_dtype, in1_dtype, 
primitive_isa, compute_hash(is_with_comp)), + is_with_comp(is_with_comp) {} bool BrgemmKernelConfig::StaticParams::operator==(const StaticParams& rhs) const { -#define EQ(X) X == rhs.X - return EQ(hash) && EQ(dt_in0) && EQ(dt_in1)&& EQ(is_with_amx) && EQ(is_with_comp) && EQ(isa); -#undef EQ + return StaticBaseParams::operator==(rhs) && is_with_comp == rhs.is_with_comp; } -size_t BrgemmKernelConfig::compute_hash() const { - size_t seed = m_static_params->hash; -#define HASH(X) seed = hash_combine(seed, X) - HASH(m_M); HASH(m_N); HASH(m_K); - HASH(m_LDA); HASH(m_LDB); HASH(m_LDC); - HASH(m_beta); -#undef HASH - return seed; + +size_t BrgemmKernelConfig::StaticParams::compute_hash(bool is_with_comp) { + return hash_combine(0, is_with_comp); } #ifdef SNIPPETS_DEBUG_CAPS -#define PRINT(X) ss << #X << " = " << X << "\n" std::string BrgemmKernelConfig::StaticParams::to_string() const { std::stringstream ss; - PRINT(dt_in0); PRINT(dt_in1); - PRINT(is_with_amx); PRINT(is_with_comp); - PRINT(isa); + ss << StaticBaseParams::to_string(); + ss << "is_with_comp = " << is_with_comp << "\n"; return ss.str(); } - -std::string BrgemmKernelConfig::to_string() const { - std::stringstream ss; - ss << m_static_params->to_string() << "\n"; - PRINT(m_M); PRINT(m_N); PRINT(m_K); - PRINT(m_LDA); PRINT(m_LDB); PRINT(m_LDC); - PRINT(m_beta); - return ss.str(); -} -#undef PRINT #endif -BrgemmKernelExecutor::BrgemmKernelExecutor(ov::intel_cpu::MultiCacheWeakPtr kernel_cache, BrgemmKernelConfig config) : - CPUKernelExecutor(std::move(kernel_cache), std::move(config)) { } - +BrgemmKernelExecutor::BrgemmKernelExecutor(ov::intel_cpu::MultiCacheWeakPtr kernel_cache, BrgemmKernelConfig config) + : CPUKernelExecutor(std::move(kernel_cache), std::move(config)) {} std::shared_ptr BrgemmKernelExecutor::compile_kernel(const BrgemmKernelConfig& config) const { std::shared_ptr compiled_kernel = std::make_shared(); @@ -138,206 +60,54 @@ std::shared_ptr BrgemmKernelExecutor::compile_kernel(const if 
(config.is_empty()) return compiled_kernel; - cpu::x64::brgemm_desc_t desc; - auto status = brgemm_desc_init(&desc, config.get_isa(), cpu::x64::brgemm_strd, - config.get_dt_in0(), config.get_dt_in1(), - false, false, cpu::x64::brgemm_row_major, 1.f, - config.get_beta(), - config.get_LDA(), config.get_LDB(), config.get_LDC(), - config.get_M(), config.get_N(), config.get_K(), nullptr); - OV_CPU_JIT_EMITTER_ASSERT(status == dnnl_success, "Cannot initialize brgemm descriptor due to invalid params"); - - if (config.is_with_amx()) { - status = brgemm_init_tiles(desc, compiled_kernel->palette); - OV_CPU_JIT_EMITTER_ASSERT(status == dnnl_success, "Cannot initialize brgemm tiles due to invalid params"); - } - - cpu::x64::brgemm_kernel_t* kernel_ = nullptr; - status = brgemm_kernel_create(&kernel_, desc); - OV_CPU_JIT_EMITTER_ASSERT(status == dnnl_success, "Cannot create brgemm kernel due to invalid params"); - compiled_kernel->compiled_kernel = std::unique_ptr(kernel_); + create_brgemm_kernel(compiled_kernel->brgemm_kernel, + config.get_dt_in0(), + config.get_dt_in1(), + config.get_isa(), + config.get_M(), + config.get_N(), + config.get_K(), + config.get_LDA(), + config.get_LDB(), + config.get_LDC(), + config.get_beta()); return compiled_kernel; } -float BrgemmKernelExecutor::get_beta(const ov::snippets::lowered::LoopManagerPtr& loop_manager, int loop_id, - const ov::snippets::lowered::ExpandedLoopInfoPtr& current_expanded_loop_info) { - // Find all Expanded loops with the same Unified loop information -> they were decomposed from this Unified Loop. - // Note that LoopInfo are normalized and sorted (due to NormalizedLoopIDs pass). - // It means that previous executed Loops have Loop ID less the current Loop ID. - // - If there is executed Loop (work_amount > 0) and evaluated before the current -> the current Brgemm should have `beta = 1`. - // - If there is not this Loop -> the current executed Brgemm should have `beta = 0`. 
- if (loop_id > 0) { - const auto& current_unified_loop_info = current_expanded_loop_info->get_unified_loop_info(); - // Check the previous Loops - --loop_id; - while (loop_id >= 0) { - const auto& expanded_loop_info = loop_manager->get_loop_info(loop_id); - if (expanded_loop_info->get_unified_loop_info() != current_unified_loop_info) - return 0; - if (expanded_loop_info->get_work_amount() > 0) { - // there is previous executed Brgemm with `beta = 0` -> the current Brgemm should have `beta = 1` - return 1; - } - --loop_id; - } - } - return 0; -} + void BrgemmKernelExecutor::update_config(const ov::snippets::lowered::ExpressionPtr& expr, const ov::snippets::lowered::LinearIRCPtr& linear_ir, BrgemmKernelConfig& config) const { - const auto& input_pds = expr->get_input_port_descriptors(); - const auto& output_pds = expr->get_output_port_descriptors(); - OV_CPU_JIT_EMITTER_ASSERT((input_pds.size() == 2 || input_pds.size() == 3) && output_pds.size() == 1, - "Invalid number of in/out port descriptors"); - - const auto in0_shape = snippets::utils::get_planar_vdims(input_pds[0]->get_shape(), input_pds[0]->get_layout()); - const auto in1_shape = snippets::utils::get_planar_vdims(input_pds[1]->get_shape(), input_pds[1]->get_layout()); - auto in0_subtensor = input_pds[0]->get_subtensor(); - auto in1_subtensor = input_pds[1]->get_subtensor(); - - // Need to update M, K, N - // 1. If the original value in subtensor is `FULL_DIM`, it means that - // Brgemm block should process full tensor by this dim -> take dimension from shape - // 2. 
Otherwise, Brgemm block processes part of the tensor by this dim - // (there is blocking by this dimension) -> take from Loop increment - - auto M = *++in0_subtensor.rbegin(); - auto K = *in0_subtensor.rbegin(); - auto N = *in1_subtensor.rbegin(); - - size_t loop_idx = 0; - const auto& loop_ids = expr->get_loop_ids(); - const auto& loop_manager = linear_ir->get_loop_manager(); - auto get_loop_info = [&](){ - OPENVINO_ASSERT(loop_idx < loop_ids.size(), "Loop is missed"); - return loop_manager->get_loop_info(loop_ids[loop_idx++]); - }; - - /* ------- Dimension M ----------*/ - if (ov::snippets::utils::is_full_dim_value(M)) { - M = *++in0_shape.rbegin(); - } else { - const auto& current_expanded_loop_info = get_loop_info(); - const auto& in_ports = current_expanded_loop_info->get_input_ports(); - const auto& out_ports = current_expanded_loop_info->get_output_ports(); - // Quick validation check: Should we check that port is really Brgemm port? - // If BrgemmCopyB in the Loop by M -> first input port will be BrgemmCopyB with `incremented=false` - // to avoid extra checks, we validate only first input port - // Note: We check `is_incremented` attribute only for not incremented ports because - // this `is_incremented = true` can be changed by `CleanRepeatedDataPointerShifts` optimization - auto check_port = [&](const ov::snippets::lowered::LoopPort& p) { return p.dim_idx == 1; }; - OPENVINO_ASSERT(in_ports.size() > 1 && std::all_of(in_ports.cbegin(), in_ports.cend(), check_port) && - out_ports.size() == 1 && check_port(out_ports.back()), - "Incorrect Loop by Brgemm dimension M"); - M = current_expanded_loop_info->get_increment(); - input_pds[0]->set_subtensor_dim(1, M); - output_pds[0]->set_subtensor_dim(1, M); - } - - /* ------- Dimension N ----------*/ - if (ov::snippets::utils::is_full_dim_value(N)) { - N = *in1_shape.rbegin(); - } else { - const auto& current_expanded_loop_info = get_loop_info(); - const auto& in_ports = current_expanded_loop_info->get_input_ports(); 
- const auto& out_ports = current_expanded_loop_info->get_output_ports(); - // Quick validation check: Should we check that port is really Brgemm port? - // Note: We check `is_incremented` attribute only for not incremented ports because - // this `is_incremented = true` can be changed by `CleanRepeatedDataPointerShifts` optimization - auto check_port = [&](const ov::snippets::lowered::LoopPort& p) { return p.dim_idx == 0; }; - OPENVINO_ASSERT(in_ports.size() >= 2 && !in_ports.front().is_incremented && std::all_of(in_ports.cbegin(), in_ports.cend(), check_port) && - out_ports.size() == 1 && check_port(out_ports.back()), - "Incorrect Loop by Brgemm dimension N"); - N = current_expanded_loop_info->get_increment(); - input_pds[1]->set_subtensor_dim(0, N); - output_pds[0]->set_subtensor_dim(0, N); - } - - /* ------- Dimension K ----------*/ - // 1. If Brgemm block processes full dimension K -> `beta = 0` - // 2. If Brgemm block processes part of the dimension K (there is blocking), need to find - // the most first executed Brgemm Block in Loops which iterate through dimension K (work_amount > 0). - // First of them will have `beta = 0`, other - `beta = 1` - float beta = 0; - if (ov::snippets::utils::is_full_dim_value(K)) { - K = *in0_shape.rbegin(); - } else { - const auto& current_expanded_loop_info = get_loop_info(); - const auto& in_ports = current_expanded_loop_info->get_input_ports(); - const auto& out_ports = current_expanded_loop_info->get_output_ports(); - // Quick validation check: Should we check that port is really Brgemm port? 
- // Note: We check `is_incremented` attribute only for not incremented ports because - // this `is_incremented = true` can be changed by `CleanRepeatedDataPointerShifts` optimization - OPENVINO_ASSERT(in_ports.size() >= 2 && in_ports.front().dim_idx == 0 && in_ports.back().dim_idx == 1 && - out_ports.size() == 1 && !out_ports.front().is_incremented, - "Incorrect Loop by Brgemm dimension K"); - K = current_expanded_loop_info->get_increment(); - input_pds[0]->set_subtensor_dim(0, K); - input_pds[1]->set_subtensor_dim(1, K); - if (K > 0) - beta = get_beta(loop_manager, static_cast(loop_ids.back()), current_expanded_loop_info); - } - - const auto LDA = DIM_CAST(snippets::utils::get_dim_stride(expr->get_input_port(0))); - const auto LDC = DIM_CAST(snippets::utils::get_dim_stride(expr->get_output_port(0))); - auto LDB = DIM_CAST(snippets::utils::get_dim_stride(expr->get_input_port(1))); - const auto& brgemm_node = as_type_ptr(expr->get_node()); - OV_CPU_JIT_EMITTER_ASSERT(brgemm_node, "Got invalid node type in update_config"); - // In case of data repacking LDB is chosen in accordance with repacking buffer size - if (with_repacking(brgemm_node->get_type())) - LDB = brgemm_utils::repacking::compute_LDB(LDB, brgemm_node->get_input_element_type(1)); - - config.update(DIM_CAST(M), DIM_CAST(N), DIM_CAST(K), LDA, LDB, LDC, beta); + return BrgemmBaseKernelExecutor::update_config(expr, linear_ir, config); } void BrgemmKernelExecutor::execute(const BrgemmKernelExecutor* executor, call_args* args) { + OV_CPU_JIT_EMITTER_ASSERT(executor, "has nullptr executor"); auto kernel = executor->get_kernel(); const auto& config = static_cast(executor->get_config()); OV_CPU_JIT_EMITTER_ASSERT(kernel, "has nullptr compiler kernel or invalid config"); - const auto tile_config = args->amx_tile_config; - if (config.is_with_amx() && tile_config && !config.compatible(tile_config)) { - *tile_config = static_cast(config); - cpu::x64::amx_tile_configure(kernel->palette); - } - - 
cpu::x64::brgemm_kernel_params_t brgemm_p; // Note: compensations should be applied only once, so we do it only on the first iteration, when beta == 0 - size_t is_with_comp = config.get_beta() == 0 && config.is_with_comp(); - - brgemm_p.batch = nullptr; // default value - brgemm_p.ptr_A = args->A; - brgemm_p.ptr_B = args->B; - brgemm_p.ptr_C = args->C; - brgemm_p.ptr_D = args->C; - brgemm_p.ptr_buf = args->scratch; - brgemm_p.ptr_bias = nullptr; - brgemm_p.do_post_ops = is_with_comp; - brgemm_p.do_apply_comp = is_with_comp; - brgemm_p.skip_accm = 0; - brgemm_p.BS = 1; // default value - OV_CPU_JIT_EMITTER_ASSERT(kernel->compiled_kernel, "has nullptr kernel"); - (*kernel->compiled_kernel)(&brgemm_p); + const auto is_with_comp = config.get_beta() == 0 && config.is_with_comp(); + execute_brgemm_kernel(kernel->brgemm_kernel, args->A, args->B, args->C, args->scratch, is_with_comp); } #ifdef SNIPPETS_DEBUG_CAPS -BrgemmKernelReferenceExecutor::BrgemmKernelReferenceExecutor(ov::intel_cpu::MultiCacheWeakPtr kernel_cache, BrgemmKernelConfig config) : - BrgemmKernelExecutor(std::move(kernel_cache), std::move(config)) { -} +BrgemmKernelReferenceExecutor::BrgemmKernelReferenceExecutor(ov::intel_cpu::MultiCacheWeakPtr kernel_cache, + BrgemmKernelConfig config) + : BrgemmKernelExecutor(std::move(kernel_cache), std::move(config)) {} std::shared_ptr BrgemmKernelReferenceExecutor::compile_kernel(const BrgemmKernelConfig& c) const { const auto& res = std::make_shared(); - res->compiled_kernel.reset(new brgemm_ref_kernel(c)); + res->brgemm_kernel.reset(new brgemm_ref_kernel(c)); return res; } brgemm_ref_kernel::brgemm_ref_kernel(BrgemmKernelConfig c) : m_config(std::move(c)) { - OV_CPU_JIT_EMITTER_ASSERT(!m_config.is_with_comp() && !m_config.is_with_amx(), - "brgemm_ref_kernel doesn't currently support compensations or amx"); - OV_CPU_JIT_EMITTER_ASSERT(m_config.get_dt_in0() == m_config.get_dt_in1() && - m_config.get_dt_in0() == dnnl_data_type_t::dnnl_f32, - "brgemm_ref_kernel 
currently supports only fp32 inputs"); + OV_CPU_JIT_EMITTER_ASSERT(!m_config.is_with_comp(), "brgemm_ref_kernel doesn't currently support compensations"); + OV_CPU_JIT_EMITTER_ASSERT( + m_config.get_dt_in0() == m_config.get_dt_in1() && m_config.get_dt_in0() == dnnl_data_type_t::dnnl_f32, + "brgemm_ref_kernel currently supports only fp32 inputs"); } void brgemm_ref_kernel::operator()(dnnl::impl::cpu::x64::brgemm_kernel_params_t* args) const { @@ -357,5 +127,5 @@ void brgemm_ref_kernel::operator()(dnnl::impl::cpu::x64::brgemm_kernel_params_t* } #endif -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.hpp index 2549580c1a176c..9cc17049c4d3ae 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.hpp @@ -1,96 +1,72 @@ -// Copyright (C) 2020-2023 Intel Corporation +// Copyright (C) 2020-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include "emitters/plugin/x64/jit_emitter.hpp" -#include "emitters/snippets/jit_snippets_call_args.hpp" -#include "emitters/snippets/cpu_kernel_executor_table.hpp" -#include - -#include "snippets/lowered/loop_manager.hpp" -#include "snippets/lowered/loop_info.hpp" +#include "brgemm_base.hpp" namespace ov { namespace intel_cpu { -struct BrgemmKernelConfig : public snippets::KernelExecutorBase::GenericConfig { + +struct BrgemmKernelConfig : public BrgemmBaseKernelConfig { public: - BrgemmKernelConfig(const element::Type& in0_dtype, const element::Type& in1_dtype, - bool is_with_amx, bool is_with_comp, dnnl::impl::cpu::x64::cpu_isa_t primitive_isa); + BrgemmKernelConfig(const element::Type& in0_dtype, + const element::Type& in1_dtype, + bool is_with_comp, + dnnl::impl::cpu::x64::cpu_isa_t primitive_isa); 
BrgemmKernelConfig() = delete; - bool is_completed() const override; - size_t hash() const override { return m_hash; } - bool operator==(const BrgemmKernelConfig& rhs) const; - bool operator!=(const BrgemmKernelConfig& rhs) const {return !(*this == rhs);} - std::unique_ptr get_clone_ptr() const override { - return std::unique_ptr( new BrgemmKernelConfig(*this)); - } - void update(dnnl_dim_t M, dnnl_dim_t N, dnnl_dim_t K, dnnl_dim_t LDA, dnnl_dim_t LDB, dnnl_dim_t LDC, float beta); - bool is_empty() const; - - dnnl_data_type_t get_dt_in0() const { return m_static_params->dt_in0; } - dnnl_data_type_t get_dt_in1() const { return m_static_params->dt_in1; } - - dnnl::impl::cpu::x64::cpu_isa_t get_isa() const { return m_static_params->isa; } - bool is_with_amx() const {return m_static_params->is_with_amx; } - bool is_with_comp() const { return m_static_params->is_with_comp; } - float get_beta() const { return m_beta; } - - dnnl_dim_t get_M() const { return m_M; } - dnnl_dim_t get_N() const { return m_N; } - dnnl_dim_t get_K() const { return m_K; } - dnnl_dim_t get_LDA() const { return m_LDA; } - dnnl_dim_t get_LDB() const { return m_LDB; } - dnnl_dim_t get_LDC() const { return m_LDC; } - - explicit operator amx_tile_config_t() const; - inline bool compatible(amx_tile_config_t* rhs) const { - return rhs && rhs->M == m_M && rhs->N == m_N && rhs->K == m_K; + std::unique_ptr get_clone_ptr() const override { + return std::unique_ptr(new BrgemmKernelConfig(*this)); } -#ifdef SNIPPETS_DEBUG_CAPS - std::string to_string() const override; -#endif + bool is_with_comp() const { + return m_static_params->is_with_comp; + } private: - struct StaticParams { - StaticParams(const element::Type& in0_dtype, const element::Type& in1_dtype, - bool is_with_amx, bool is_with_comp, dnnl::impl::cpu::x64::cpu_isa_t primitive_isa); - const dnnl_data_type_t dt_in0 {dnnl_f32}, dt_in1 {dnnl_f32}; - const bool is_with_amx {false}; - const bool is_with_comp {false}; - const 
dnnl::impl::cpu::x64::cpu_isa_t isa {dnnl::impl::cpu::x64::isa_undef}; - const size_t hash {0}; + struct StaticParams : StaticBaseParams { + StaticParams(const element::Type& in0_dtype, + const element::Type& in1_dtype, + bool is_with_comp, + dnnl::impl::cpu::x64::cpu_isa_t primitive_isa); + + const bool is_with_comp{false}; + bool operator==(const StaticParams& rhs) const; - bool operator!=(const StaticParams& rhs) const { return !(*this == rhs); } + bool operator!=(const StaticParams& rhs) const { + return !(*this == rhs); + } #ifdef SNIPPETS_DEBUG_CAPS std::string to_string() const; #endif + private: + static size_t compute_hash(bool is_with_comp); }; - size_t compute_hash() const; - std::shared_ptr m_static_params; - dnnl_dim_t m_M {0}, m_N {0}, m_K {0}, m_LDA {0}, m_LDB {0}, m_LDC {0}; - float m_beta {0}; - size_t m_hash {SIZE_MAX}; + + std::shared_ptr get_static_params() const override { + return m_static_params; + } + + std::shared_ptr m_static_params{nullptr}; }; +// The `update_kernel` method verifies that a compiled kernel is not nullptr. +// However, the compiled kernel might be empty in cases if nothing is to be compiled (`Config.is_empty() == true`). +// To cover this case, we wrap the `brgemm_kernel_t` in the separate structure which may contain empty `brgemm_kernel_t` struct BrgemmCompiledKernel { - std::unique_ptr compiled_kernel = nullptr; - // Note: Palette is treated as a part of a kernel because it is initialized during the kernel compilation stage. - // Each kernel need to store the pallet it was compiled with. 
- char palette[64] = {}; + std::shared_ptr brgemm_kernel = nullptr; }; -class BrgemmKernelExecutor : public CPUKernelExecutor { +class BrgemmKernelExecutor : public BrgemmBaseKernelExecutor, + public CPUKernelExecutor { public: struct call_args { const void* A = nullptr; const void* B = nullptr; void* C = nullptr; void* scratch = nullptr; - amx_tile_config_t* amx_tile_config = nullptr; }; BrgemmKernelExecutor(ov::intel_cpu::MultiCacheWeakPtr kernel_cache, BrgemmKernelConfig config); @@ -99,12 +75,10 @@ class BrgemmKernelExecutor : public CPUKernelExecutor compile_kernel(const BrgemmKernelConfig& c) const override; + void update_config(const ov::snippets::lowered::ExpressionPtr& expr, const ov::snippets::lowered::LinearIRCPtr& linear_ir, BrgemmKernelConfig& config) const override; - - static float get_beta(const ov::snippets::lowered::LoopManagerPtr& loop_manager, int loop_id, - const ov::snippets::lowered::ExpandedLoopInfoPtr& current_expanded_loop_info); }; #define GET_OFF_BRGEMM_ARGS(field) offsetof(BrgemmKernelExecutor::call_args, field) @@ -113,20 +87,25 @@ class BrgemmKernelReferenceExecutor : public BrgemmKernelExecutor { public: BrgemmKernelReferenceExecutor(ov::intel_cpu::MultiCacheWeakPtr kernel_cache, BrgemmKernelConfig config); using BrgemmKernelExecutor::execute; + protected: std::shared_ptr compile_kernel(const BrgemmKernelConfig& c) const override; }; + struct brgemm_ref_kernel : public dnnl::impl::cpu::x64::brgemm_kernel_t { brgemm_ref_kernel(BrgemmKernelConfig c); - void operator()(dnnl::impl::cpu::x64::brgemm_kernel_params_t *) const override; - dnnl_status_t create_kernel() override { return dnnl_status_t::dnnl_success; } - const dnnl::impl::cpu::x64::jit_generator *get_jit_generator() const override { + void operator()(dnnl::impl::cpu::x64::brgemm_kernel_params_t*) const override; + dnnl_status_t create_kernel() override { + return dnnl_status_t::dnnl_success; + } + const dnnl::impl::cpu::x64::jit_generator* get_jit_generator() const override { 
OV_CPU_JIT_EMITTER_THROW("get_jit_generator should not be called for reference kernel"); return nullptr; } + private: BrgemmKernelConfig m_config; }; #endif -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_amx.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_amx.cpp new file mode 100644 index 00000000000000..12c52d43b2c4b8 --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_amx.cpp @@ -0,0 +1,317 @@ +// Copyright (C) 2020-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "brgemm_amx.hpp" + +#include + +#include "transformations/snippets/x64/op/brgemm_cpu.hpp" +#include "transformations/snippets/x64/op/brgemm_utils.hpp" + +#define INNER_K_BLK(dtype) static_cast((brgemm_utils::repacking::compute_inner_k_block(in0_dtype))) +#define VNNI_FACTOR(dtype) static_cast((brgemm_utils::compute_vnni_factor(in0_dtype))) +#define EQ(X) X == rhs.X +#define HASH(X) seed = hash_combine(seed, X) + +using namespace Xbyak; +using namespace dnnl::impl; +using namespace dnnl::impl::cpu::x64; + +namespace ov { +namespace intel_cpu { + +BrgemmAMXKernelConfig::BrgemmAMXKernelConfig(const element::Type& in0_dtype, + const element::Type& in1_dtype, + dnnl::impl::cpu::x64::cpu_isa_t primitive_isa) + : BrgemmBaseKernelConfig(), + m_static_params(std::make_shared(in0_dtype, in1_dtype, primitive_isa)) { + m_hash = compute_hash(); +} + +BrgemmAMXKernelConfig::StaticParams::StaticParams(const element::Type& in0_dtype, + const element::Type& in1_dtype, + dnnl::impl::cpu::x64::cpu_isa_t primitive_isa) + : StaticBaseParams(in0_dtype, + in1_dtype, + primitive_isa, + compute_hash(INNER_K_BLK(in0_dtype), VNNI_FACTOR(in0_dtype))), + inner_k_blk(INNER_K_BLK(in0_dtype)), + vnni_factor(VNNI_FACTOR(in0_dtype)) {} + +bool BrgemmAMXKernelConfig::StaticParams::operator==(const StaticParams& 
rhs) const { + return StaticBaseParams::operator==(rhs) && EQ(inner_k_blk) && EQ(vnni_factor); +} + +size_t BrgemmAMXKernelConfig::StaticParams::compute_hash(dnnl_dim_t inner_k_blk, dnnl_dim_t vnni_factor) { + size_t seed = 0; + HASH(inner_k_blk); + HASH(vnni_factor); + return seed; +} + +bool BrgemmAMXKernelConfig::need_copy_a(dnnl_dim_t K) const { + return K % get_vnni_factor() > 0; +} + +#ifdef SNIPPETS_DEBUG_CAPS +std::string BrgemmAMXKernelConfig::StaticParams::to_string() const { + std::stringstream ss; + ss << StaticBaseParams::to_string(); + ss << "inner_k_blk = " << inner_k_blk << "\n"; + ss << "vnni_factor = " << vnni_factor << "\n"; + return ss.str(); +} +#endif + +BrgemmAMXKernelExecutor::BrgemmAMXKernelExecutor(ov::intel_cpu::MultiCacheWeakPtr kernel_cache, + BrgemmAMXKernelConfig config) + : CPUKernelExecutor(std::move(kernel_cache), std::move(config)) {} + +namespace { +struct BrgemmCopyAKey { + BrgemmCopyAKey(cpu_isa_t isa, + dnnl_data_type_t dt, + dnnl_dim_t K, + dnnl_dim_t K_blk, + dnnl_dim_t K_tail, + dnnl_dim_t src_stride, + dnnl_dim_t LDA) + : isa(isa), + dt(dt), + K{K}, + K_blk{K_blk}, + K_tail{K_tail}, + src_stride{src_stride}, + LDA{LDA} {} + + size_t hash() const { + size_t seed = 0; + HASH(isa); + HASH(dt); + HASH(K); + HASH(K_blk); + HASH(K_tail); + HASH(src_stride); + HASH(LDA); + return seed; + } + bool operator==(const BrgemmCopyAKey& rhs) const { + return EQ(isa) && EQ(dt) && EQ(K) && EQ(K_blk) && EQ(K_tail) && EQ(src_stride) && EQ(LDA); + } + + cpu_isa_t isa{cpu_isa_t::isa_undef}; + dnnl_data_type_t dt{dnnl_data_type_t::dnnl_data_type_undef}; + dnnl_dim_t K{0}, K_blk{0}, K_tail{0}, src_stride{0}, LDA{0}; +}; +} // namespace + +std::shared_ptr BrgemmAMXKernelExecutor::compile_kernel( + const BrgemmAMXKernelConfig& config) const { + std::shared_ptr compiled_kernel = std::make_shared(); + + // Brgemm is not executable - nothing to compile + if (config.is_empty()) + return compiled_kernel; + + const auto& cache = m_kernel_cache.lock(); + 
OPENVINO_ASSERT(cache, "Invalid kernel cache pointer in BrgemmAMXKernelExecutor::compile_kernel()"); + + auto brgemm_key = [&config](dnnl_dim_t K, dnnl_dim_t LDA, float beta) { + auto key = config; + key.update(config.get_M(), config.get_N(), K, LDA, config.get_LDB(), config.get_LDC(), beta); + return key; + }; + + auto brgemm_builder = [](const BrgemmAMXKernelConfig& k) { + std::shared_ptr ker = + std::make_shared(); + create_brgemm_kernel(ker->brgemm_kernel, + k.get_dt_in0(), + k.get_dt_in1(), + k.get_isa(), + k.get_M(), + k.get_N(), + k.get_K(), + k.get_LDA(), + k.get_LDB(), + k.get_LDC(), + k.get_beta(), + true, + ker->palette); + return ker; + }; + + auto brgemm_copy_a_builder = [](const BrgemmCopyAKey& k) { + std::shared_ptr ker{nullptr}; + create_brgemm_copy_a_kernel(ker, k.isa, k.dt, k.K, k.K_blk, k.K_tail, k.src_stride, k.LDA); + return ker; + }; + + auto K_tail = config.get_K() % config.get_inner_K_blk(); + auto K_body = config.get_K() - K_tail; + + float beta = config.get_beta(); + + // Brgemm Kernel for K_body + if (K_body != 0) { + const auto result = cache->getOrCreate(brgemm_key(K_body, config.get_LDA(), beta), brgemm_builder); + compiled_kernel->K_body_kernel = result.first; + beta = 1; + } + + // Brgemm Kernel for K_tail with BrgemmCopyA if needed + if (K_tail != 0) { + auto LDA = config.get_LDA(); + if (config.need_copy_a(K_tail)) { + const auto copy_A_src_stride = LDA * dnnl_data_type_size(config.get_dt_in0()); + K_tail = ov::snippets::utils::rnd_up(K_tail, config.get_vnni_factor()); + LDA = K_tail; + + const auto key = BrgemmCopyAKey(config.get_isa(), + config.get_dt_in0(), + config.get_K(), + config.get_inner_K_blk(), + K_tail, + copy_A_src_stride, + LDA); + const auto result = cache->getOrCreate(key, brgemm_copy_a_builder); + compiled_kernel->brgemm_copy_a_kernel = result.first; + } + + const auto result = cache->getOrCreate(brgemm_key(K_tail, LDA, beta), brgemm_builder); + compiled_kernel->K_tail_kernel = result.first; + } + + return 
compiled_kernel; +} + +void BrgemmAMXKernelExecutor::create_brgemm_copy_a_kernel( + std::shared_ptr& kernel, + dnnl::impl::cpu::x64::cpu_isa_t isa, + dnnl_data_type_t dt, + dnnl_dim_t K, + dnnl_dim_t K_blk, + dnnl_dim_t K_tail, + dnnl_dim_t src_stride, + dnnl_dim_t LDA) { + matmul::brgemm_matmul_conf_t conf_; + conf_.src_tag = dnnl_abcd; // unused + conf_.K = K; + conf_.K_tail = K_tail; + conf_.K_blk = K_blk; + conf_.use_buffer_a_tail_only = false; + conf_.LDA = LDA; + conf_.has_zero_point_b = false; + conf_.s8s8_compensation_required = false; + conf_.wei_zp_type = dnnl::impl::cpu::x64::none; + conf_.src_zp_type = dnnl::impl::cpu::x64::none; + conf_.src_dt = dt; + conf_.copy_A_src_stride = src_stride; + conf_.a_dt_sz = dnnl_data_type_size(conf_.src_dt); + // copied A has the same precision of original + conf_.tr_a_dt_sz = dnnl_data_type_size(conf_.src_dt); + conf_.transposed_A = false; + conf_.isa = isa; + + std::unique_ptr brgemm_matmul_copy_a = nullptr; + OV_CPU_JIT_EMITTER_ASSERT(create_brgemm_matmul_copy_a(brgemm_matmul_copy_a, &conf_) == dnnl_success, + "Cannot create brgemm copy a kernel due to invalid params"); + kernel = std::move(brgemm_matmul_copy_a); +} + +void BrgemmAMXKernelExecutor::update_config(const ov::snippets::lowered::ExpressionPtr& expr, + const ov::snippets::lowered::LinearIRCPtr& linear_ir, + BrgemmAMXKernelConfig& config) const { + return BrgemmBaseKernelExecutor::update_config(expr, linear_ir, config); +} + +void BrgemmAMXKernelExecutor::configure_tiles_if_needed(amx_tile_config_t* config, + const char* palette, + dnnl_dim_t M, + dnnl_dim_t N, + dnnl_dim_t K) { + auto compatible = [&](amx_tile_config_t* rhs) { + return rhs && rhs->M == M && rhs->N == N && rhs->K == K; + }; + if (config && !compatible(config)) { + config->M = M; + config->N = N; + config->K = K; + cpu::x64::amx_tile_configure(palette); + } +} + +void BrgemmAMXKernelExecutor::execute_brgemm_copy_a_kernel( + const std::shared_ptr& kernel, + const void* src, + const void* 
tr_src, + dnnl_dim_t M, + dnnl_dim_t K) { + auto ctx = matmul::jit_brgemm_matmul_copy_a_t::ctx_t(); + + ctx.current_M_blk = M; + ctx.zp_b_compensation_buffer_ptr = nullptr; + ctx.zp_a_compensation_result_ptr = nullptr; + ctx.zp_b_neg_value_ptr = nullptr; + ctx.zp_ab_comp_ptr = nullptr; + ctx.src = src; + ctx.tr_src = tr_src; + ctx.current_K_start = 0; + ctx.current_K_blk = K; + + OV_CPU_JIT_EMITTER_ASSERT(kernel, "has nullptr brgemm_copy_a_kernel"); + (*kernel)(&ctx); +} + +void BrgemmAMXKernelExecutor::execute(const BrgemmAMXKernelExecutor* executor, call_args* args) { + OV_CPU_JIT_EMITTER_ASSERT(executor, "has nullptr executor"); + auto kernel = executor->get_kernel(); + const auto& config = static_cast(executor->get_config()); + OV_CPU_JIT_EMITTER_ASSERT(kernel, "has nullptr compiler kernel or invalid config"); + + const auto* src_ptr = args->A; + const auto* wei_ptr = args->B; + auto* scratch = args->scratch; + + const auto K_tail = config.get_K() % config.get_inner_K_blk(); + const auto K_body = config.get_K() - K_tail; + + if (K_body != 0) { + const auto& K_body_kernel = kernel->K_body_kernel; + configure_tiles_if_needed(args->amx_tile_config, + K_body_kernel->palette, + config.get_M(), + config.get_N(), + K_body); + execute_brgemm_kernel(K_body_kernel->brgemm_kernel, src_ptr, wei_ptr, args->C, scratch, false); + + src_ptr = src_ptr + K_body * dnnl_data_type_size(config.get_dt_in0()); + wei_ptr = wei_ptr + (K_body * config.get_LDB()) * dnnl_data_type_size(config.get_dt_in1()); + } + + if (K_tail != 0) { + if (config.need_copy_a(K_tail)) { + auto* tr_src = scratch + BrgemmCPU::SCRATCH_BYTE_SIZE; + + execute_brgemm_copy_a_kernel(kernel->brgemm_copy_a_kernel, src_ptr, tr_src, config.get_M(), K_tail); + src_ptr = tr_src; + } + + const auto& K_tail_kernel = kernel->K_tail_kernel; + configure_tiles_if_needed(args->amx_tile_config, + K_tail_kernel->palette, + config.get_M(), + config.get_N(), + K_tail); + execute_brgemm_kernel(K_tail_kernel->brgemm_kernel, src_ptr, 
wei_ptr, args->C, scratch, false); + } +} + +#undef INNER_K_BLK +#undef VNNI_FACTOR +#undef EQ +#undef HASH + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_amx.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_amx.hpp new file mode 100644 index 00000000000000..733295ec995583 --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_amx.hpp @@ -0,0 +1,126 @@ +// Copyright (C) 2020-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +#include "brgemm_base.hpp" +#include "emitters/plugin/x64/jit_emitter.hpp" +#include "emitters/snippets/cpu_kernel_executor_table.hpp" +#include "emitters/snippets/jit_snippets_call_args.hpp" + +namespace ov { +namespace intel_cpu { + +struct BrgemmAMXKernelConfig : public BrgemmBaseKernelConfig { +public: + BrgemmAMXKernelConfig(const element::Type& in0_dtype, + const element::Type& in1_dtype, + dnnl::impl::cpu::x64::cpu_isa_t primitive_isa); + BrgemmAMXKernelConfig() = delete; + + std::unique_ptr get_clone_ptr() const override { + return std::unique_ptr(new BrgemmAMXKernelConfig(*this)); + } + + dnnl_dim_t get_inner_K_blk() const { + return m_static_params->inner_k_blk; + } + dnnl_dim_t get_vnni_factor() const { + return m_static_params->vnni_factor; + } + + bool need_copy_a(dnnl_dim_t K) const; + +private: + struct StaticParams : StaticBaseParams { + StaticParams(const element::Type& in0_dtype, + const element::Type& in1_dtype, + dnnl::impl::cpu::x64::cpu_isa_t primitive_isa); + + const dnnl_dim_t inner_k_blk{0}; + const dnnl_dim_t vnni_factor{0}; + + bool operator==(const StaticParams& rhs) const; + bool operator!=(const StaticParams& rhs) const { + return !(*this == rhs); + } +#ifdef SNIPPETS_DEBUG_CAPS + std::string to_string() const; +#endif + private: + static size_t compute_hash(dnnl_dim_t inner_k_blk, dnnl_dim_t vnni_factor); + }; 
+ + std::shared_ptr get_static_params() const override { + return m_static_params; + } + + std::shared_ptr m_static_params{nullptr}; +}; + +struct BrgemmAMXCompiledKernel { + struct BrgemmKernel { + std::shared_ptr brgemm_kernel{nullptr}; + // Note: Palette is treated as a part of a kernel because it is initialized during the kernel compilation stage. + // Each kernel need to store the pallet it was compiled with. + char palette[64] = {}; + }; + + std::shared_ptr K_body_kernel{nullptr}; + std::shared_ptr K_tail_kernel{nullptr}; + std::shared_ptr brgemm_copy_a_kernel{nullptr}; +}; + +class BrgemmAMXKernelExecutor : public BrgemmBaseKernelExecutor, + public CPUKernelExecutor { +public: + struct call_args { + const uint8_t* A = nullptr; + const uint8_t* B = nullptr; + void* C = nullptr; + uint8_t* scratch = nullptr; + amx_tile_config_t* amx_tile_config = nullptr; + }; + BrgemmAMXKernelExecutor(ov::intel_cpu::MultiCacheWeakPtr kernel_cache, BrgemmAMXKernelConfig config); + + /** Function that will be called in runtime to execute the kernel */ + static void execute(const BrgemmAMXKernelExecutor* executor, call_args* args); + +protected: + std::shared_ptr compile_kernel(const BrgemmAMXKernelConfig& c) const override; + + void update_config(const ov::snippets::lowered::ExpressionPtr& expr, + const ov::snippets::lowered::LinearIRCPtr& linear_ir, + BrgemmAMXKernelConfig& config) const override; + + static void configure_tiles_if_needed(amx_tile_config_t* config, + const char* palette, + dnnl_dim_t M, + dnnl_dim_t N, + dnnl_dim_t K); + + static void create_brgemm_copy_a_kernel( + std::shared_ptr& kernel, + dnnl::impl::cpu::x64::cpu_isa_t isa, + dnnl_data_type_t dt, + dnnl_dim_t K, + dnnl_dim_t K_blk, + dnnl_dim_t K_tail, + dnnl_dim_t src_stride, + dnnl_dim_t LDA); + + static void execute_brgemm_copy_a_kernel( + const std::shared_ptr& kernel, + const void* src, + const void* tr_src, + dnnl_dim_t M, + dnnl_dim_t K); +}; +#define GET_OFF_BRGEMM_AMX_ARGS(field) 
offsetof(BrgemmAMXKernelExecutor::call_args, field) + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_base.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_base.cpp new file mode 100644 index 00000000000000..008237780de3f6 --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_base.cpp @@ -0,0 +1,342 @@ +// Copyright (C) 2020-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "brgemm_base.hpp" + +#include "common/utils.hpp" +#include "dnnl_extension_utils.h" +#include "transformations/snippets/x64/op/brgemm_cpu.hpp" +#include "transformations/snippets/x64/op/brgemm_utils.hpp" + +#define DIM_CAST(X) static_cast(X) +#define DTYPE_CAST(X) static_cast(DnnlExtensionUtils::ElementTypeToDataType(X)) +#define PRINT(X) ss << #X << " = " << X << "\n" +#define EQ(X) X == rhs.X +#define HASH(X) seed = hash_combine(seed, X) + +using namespace Xbyak; +using namespace dnnl::impl; +using namespace dnnl::impl::cpu::x64; + +namespace ov { +namespace intel_cpu { + +bool BrgemmBaseKernelConfig::is_completed() const { + return !utils::one_of(0, m_M, m_N, m_K, m_LDA, m_LDB, m_LDC) || is_empty(); +} + +bool BrgemmBaseKernelConfig::is_empty() const { + return everyone_is(0, m_M, m_N, m_K, m_LDA, m_LDB, m_LDC, m_beta); +} + +bool BrgemmBaseKernelConfig::operator==(const BrgemmBaseKernelConfig& rhs) const { + return EQ(m_hash) && EQ(m_beta) && EQ(m_M) && EQ(m_N) && EQ(m_K) && EQ(m_LDA) && EQ(m_LDB) && EQ(m_LDC) && + (EQ(get_static_params()) || *get_static_params() == *(rhs.get_static_params())); +} + +void BrgemmBaseKernelConfig::update(dnnl_dim_t M, + dnnl_dim_t N, + dnnl_dim_t K, + dnnl_dim_t LDA, + dnnl_dim_t LDB, + dnnl_dim_t LDC, + float beta) { + // If M is zero, it means that Brgemm won't be executed (in Loop with work_amount = 0, for example) + // To process this case, we have to make this Config as empty 
(nullify runtime parameters) + if (utils::one_of(0, M, N, K)) { + m_M = 0; + m_N = 0; + m_K = 0; + m_LDA = 0; + m_LDB = 0; + m_LDC = 0; + m_beta = 0; + } else { + m_M = M; + m_N = N; + m_K = K; + m_LDA = LDA; + m_LDB = LDB; + m_LDC = LDC; + m_beta = beta; + } + m_hash = compute_hash(); +} + +size_t BrgemmBaseKernelConfig::compute_hash() const { + size_t seed = get_static_params()->hash(); + HASH(m_M); + HASH(m_N); + HASH(m_K); + HASH(m_LDA); + HASH(m_LDB); + HASH(m_LDC); + HASH(m_beta); + return seed; +} + +BrgemmBaseKernelConfig::StaticBaseParams::StaticBaseParams(const element::Type& in0_dtype, + const element::Type& in1_dtype, + cpu_isa_t primitive_isa, + size_t hash_seed) + : dt_in0(DTYPE_CAST(in0_dtype)), + dt_in1(DTYPE_CAST(in1_dtype)), + isa(primitive_isa), + m_hash(compute_hash(hash_seed, dt_in0, dt_in1, isa)) {} + +bool BrgemmBaseKernelConfig::StaticBaseParams::operator==(const StaticBaseParams& rhs) const { + return EQ(hash()) && EQ(dt_in0) && EQ(dt_in1) && EQ(isa); +} + +size_t BrgemmBaseKernelConfig::StaticBaseParams::compute_hash(size_t hash_seed, + dnnl_data_type_t dt_in0, + dnnl_data_type_t dt_in1, + cpu_isa_t isa) { + size_t seed = hash_seed; + HASH(dt_in0); + HASH(dt_in1); + HASH(isa); + return seed; +} + +#ifdef SNIPPETS_DEBUG_CAPS +std::string BrgemmBaseKernelConfig::StaticBaseParams::to_string() const { + std::stringstream ss; + PRINT(dt_in0); + PRINT(dt_in1); + PRINT(isa); + return ss.str(); +} + +std::string BrgemmBaseKernelConfig::to_string() const { + std::stringstream ss; + ss << get_static_params()->to_string() << "\n"; + PRINT(m_M); + PRINT(m_N); + PRINT(m_K); + PRINT(m_LDA); + PRINT(m_LDB); + PRINT(m_LDC); + PRINT(m_beta); + return ss.str(); +} +#endif + +float BrgemmBaseKernelExecutor::get_beta(const ov::snippets::lowered::LoopManagerPtr& loop_manager, + int loop_id, + const ov::snippets::lowered::ExpandedLoopInfoPtr& current_expanded_loop_info) { + // Find all Expanded loops with the same Unified loop information -> they were 
decomposed from this Unified Loop. + // Note that LoopInfo are normalized and sorted (due to NormalizedLoopIDs pass). + // It means that previous executed Loops have Loop ID less the current Loop ID. + // - If there is executed Loop (work_amount > 0) and evaluated before the current -> the current Brgemm should have + // `beta = 1`. + // - If there is not this Loop -> the current executed Brgemm should have `beta = 0`. + if (loop_id > 0) { + const auto& current_unified_loop_info = current_expanded_loop_info->get_unified_loop_info(); + // Check the previous Loops + --loop_id; + while (loop_id >= 0) { + const auto& expanded_loop_info = + loop_manager->get_loop_info(loop_id); + if (expanded_loop_info->get_unified_loop_info() != current_unified_loop_info) + return 0; + if (expanded_loop_info->get_work_amount() > 0) { + // there is previous executed Brgemm with `beta = 0` -> the current Brgemm should have `beta = 1` + return 1; + } + --loop_id; + } + } + return 0; +} + +void BrgemmBaseKernelExecutor::update_config(const ov::snippets::lowered::ExpressionPtr& expr, + const ov::snippets::lowered::LinearIRCPtr& linear_ir, + BrgemmBaseKernelConfig& config) { + const auto& input_pds = expr->get_input_port_descriptors(); + const auto& output_pds = expr->get_output_port_descriptors(); + OV_CPU_JIT_EMITTER_ASSERT((input_pds.size() == 2 || input_pds.size() == 3) && output_pds.size() == 1, + "Invalid number of in/out port descriptors"); + + const auto in0_shape = snippets::utils::get_planar_vdims(input_pds[0]->get_shape(), input_pds[0]->get_layout()); + const auto in1_shape = snippets::utils::get_planar_vdims(input_pds[1]->get_shape(), input_pds[1]->get_layout()); + auto in0_subtensor = input_pds[0]->get_subtensor(); + auto in1_subtensor = input_pds[1]->get_subtensor(); + + // Need to update M, K, N + // 1. If the original value in subtensor is `FULL_DIM`, it means that + // Brgemm block should process full tensor by this dim -> take dimension from shape + // 2. 
Otherwise, Brgemm block processes part of the tensor by this dim + // (there is blocking by this dimension) -> take from Loop increment + + auto M = *++in0_subtensor.rbegin(); + auto K = *in0_subtensor.rbegin(); + auto N = *in1_subtensor.rbegin(); + + size_t loop_idx = 0; + const auto& loop_ids = expr->get_loop_ids(); + const auto& loop_manager = linear_ir->get_loop_manager(); + auto get_loop_info = [&]() { + OPENVINO_ASSERT(loop_idx < loop_ids.size(), "Loop is missed"); + return loop_manager->get_loop_info(loop_ids[loop_idx++]); + }; + + /* ------- Dimension M ----------*/ + if (ov::snippets::utils::is_full_dim_value(M)) { + M = *++in0_shape.rbegin(); + } else { + const auto& current_expanded_loop_info = get_loop_info(); + const auto& in_ports = current_expanded_loop_info->get_input_ports(); + const auto& out_ports = current_expanded_loop_info->get_output_ports(); + // Quick validation check: Should we check that port is really Brgemm port? + // If BrgemmCopyB in the Loop by M -> first input port will be BrgemmCopyB with `incremented=false` + // to avoid extra checks, we validate only first input port + // Note: We check `is_incremented` attribute only for not incremented ports because + // this `is_incremented = true` can be changed by `CleanRepeatedDataPointerShifts` optimization + auto check_port = [&](const ov::snippets::lowered::LoopPort& p) { + return p.dim_idx == 1; + }; + OPENVINO_ASSERT(in_ports.size() > 1 && std::all_of(in_ports.cbegin(), in_ports.cend(), check_port) && + out_ports.size() == 1 && check_port(out_ports.back()), + "Incorrect Loop by Brgemm dimension M"); + M = current_expanded_loop_info->get_work_amount() > 0 ? 
current_expanded_loop_info->get_increment() : 0; + input_pds[0]->set_subtensor_dim(1, M); + output_pds[0]->set_subtensor_dim(1, M); + } + + /* ------- Dimension N ----------*/ + if (ov::snippets::utils::is_full_dim_value(N)) { + N = *in1_shape.rbegin(); + } else { + const auto& current_expanded_loop_info = get_loop_info(); + const auto& in_ports = current_expanded_loop_info->get_input_ports(); + const auto& out_ports = current_expanded_loop_info->get_output_ports(); + // Quick validation check: Should we check that port is really Brgemm port? + // Note: We check `is_incremented` attribute only for not incremented ports because + // this `is_incremented = true` can be changed by `CleanRepeatedDataPointerShifts` optimization + auto check_port = [&](const ov::snippets::lowered::LoopPort& p) { + return p.dim_idx == 0; + }; + OPENVINO_ASSERT(in_ports.size() >= 2 && !in_ports.front().is_incremented && + std::all_of(in_ports.cbegin(), in_ports.cend(), check_port) && out_ports.size() == 1 && + check_port(out_ports.back()), + "Incorrect Loop by Brgemm dimension N"); + N = current_expanded_loop_info->get_work_amount() > 0 ? current_expanded_loop_info->get_increment() : 0; + input_pds[1]->set_subtensor_dim(0, N); + output_pds[0]->set_subtensor_dim(0, N); + } + + /* ------- Dimension K ----------*/ + // 1. If Brgemm block processes full dimension K -> `beta = 0` + // 2. If Brgemm block processes part of the dimension K (there is blocking), need to find + // the most first executed Brgemm Block in Loops which iterate through dimension K (work_amount > 0). 
+ // First of them will have `beta = 0`, other - `beta = 1` + float beta = 0; + if (ov::snippets::utils::is_full_dim_value(K)) { + K = *in0_shape.rbegin(); + } else { + const auto& current_expanded_loop_info = get_loop_info(); + const auto& in_ports = current_expanded_loop_info->get_input_ports(); + const auto& out_ports = current_expanded_loop_info->get_output_ports(); + // Quick validation check: Should we check that port is really Brgemm port? + // Note: We check `is_incremented` attribute only for not incremented ports because + // this `is_incremented = true` can be changed by `CleanRepeatedDataPointerShifts` optimization + OPENVINO_ASSERT(in_ports.size() >= 2 && in_ports.front().dim_idx == 0 && in_ports.back().dim_idx == 1 && + out_ports.size() == 1 && !out_ports.front().is_incremented, + "Incorrect Loop by Brgemm dimension K"); + K = current_expanded_loop_info->get_work_amount() > 0 ? current_expanded_loop_info->get_increment() : 0; + input_pds[0]->set_subtensor_dim(0, K); + input_pds[1]->set_subtensor_dim(1, K); + if (K > 0) + beta = get_beta(loop_manager, static_cast(loop_ids.back()), current_expanded_loop_info); + } + + const auto LDA = DIM_CAST(snippets::utils::get_dim_stride(expr->get_input_port(0))); + const auto LDC = DIM_CAST(snippets::utils::get_dim_stride(expr->get_output_port(0))); + auto LDB = DIM_CAST(snippets::utils::get_dim_stride(expr->get_input_port(1))); + + const auto& brgemm_node = as_type_ptr(expr->get_node()); + OV_CPU_JIT_EMITTER_ASSERT(brgemm_node, "Got invalid node type in update_config"); + // In case of data repacking LDB is chosen in accordance with repacking buffer size + if (with_repacking(brgemm_node->get_type())) + LDB = DIM_CAST(brgemm_utils::repacking::compute_LDB(LDB, brgemm_node->get_input_element_type(1))); + + config.update(DIM_CAST(M), DIM_CAST(N), DIM_CAST(K), LDA, LDB, LDC, beta); +} + +void BrgemmBaseKernelExecutor::create_brgemm_kernel(std::shared_ptr& kernel, + dnnl_data_type_t dt0, + dnnl_data_type_t dt1, + 
cpu_isa_t isa, + dnnl_dim_t M, + dnnl_dim_t N, + dnnl_dim_t K, + dnnl_dim_t LDA, + dnnl_dim_t LDB, + dnnl_dim_t LDC, + float beta, + bool with_amx, + char* palette) { + cpu::x64::brgemm_desc_t desc; + OV_CPU_JIT_EMITTER_ASSERT(brgemm_desc_init(&desc, + isa, + cpu::x64::brgemm_strd, + dt0, + dt1, + false, + false, + cpu::x64::brgemm_row_major, + 1.f, + beta, + LDA, + LDB, + LDC, + M, + N, + K, + nullptr) == dnnl_success, + "Cannot initialize brgemm descriptor due to invalid params"); + + if (with_amx) { + OV_CPU_JIT_EMITTER_ASSERT(palette && brgemm_init_tiles(desc, palette) == dnnl_success, + "Cannot initialize brgemm tiles due to invalid params"); + } + + cpu::x64::brgemm_kernel_t* kernel_ = nullptr; + OV_CPU_JIT_EMITTER_ASSERT(brgemm_kernel_create(&kernel_, desc) == dnnl_success, + "Cannot create brgemm kernel due to invalid params"); + kernel = std::unique_ptr(kernel_); +} + +void BrgemmBaseKernelExecutor::execute_brgemm_kernel( + const std::shared_ptr& kernel, + const void* src, + const void* wei, + void* dst, + void* scratch, + bool with_comp) { + cpu::x64::brgemm_kernel_params_t brgemm_p; + brgemm_p.batch = nullptr; // default value + brgemm_p.ptr_A = src; + brgemm_p.ptr_B = wei; + brgemm_p.ptr_C = dst; + brgemm_p.ptr_D = dst; + brgemm_p.ptr_buf = scratch; + brgemm_p.ptr_bias = nullptr; + brgemm_p.do_post_ops = with_comp; + brgemm_p.do_apply_comp = with_comp; + brgemm_p.skip_accm = 0; + brgemm_p.BS = 1; // default value + OV_CPU_JIT_EMITTER_ASSERT(kernel, "has nullptr Brgemm kernel"); + (*kernel)(&brgemm_p); +} + +#undef DIM_CAST +#undef DTYPE_CAST +#undef PRINT +#undef EQ +#undef HASH + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_base.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_base.hpp new file mode 100644 index 00000000000000..674ea42522230b --- /dev/null +++ 
b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_base.hpp @@ -0,0 +1,150 @@ +// Copyright (C) 2020-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#include "cpu/x64/cpu_isa_traits.hpp" +#include "emitters/plugin/x64/jit_emitter.hpp" +#include "emitters/snippets/cpu_kernel_executor_table.hpp" +#include "emitters/snippets/jit_snippets_call_args.hpp" +#include "openvino/core/type/element_type.hpp" +#include "snippets/lowered/loop_info.hpp" +#include "snippets/lowered/loop_manager.hpp" + +namespace ov { +namespace intel_cpu { + +struct BrgemmBaseKernelConfig : public snippets::KernelExecutorBase::GenericConfig { +public: + BrgemmBaseKernelConfig() = default; + + bool is_completed() const override; + size_t hash() const override { + return m_hash; + } + + bool is_empty() const; + void update(dnnl_dim_t M, dnnl_dim_t N, dnnl_dim_t K, dnnl_dim_t LDA, dnnl_dim_t LDB, dnnl_dim_t LDC, float beta); + + bool operator==(const BrgemmBaseKernelConfig& rhs) const; + bool operator!=(const BrgemmBaseKernelConfig& rhs) const { + return !(*this == rhs); + } + + dnnl_data_type_t get_dt_in0() const { + return get_static_params()->dt_in0; + } + dnnl_data_type_t get_dt_in1() const { + return get_static_params()->dt_in1; + } + + dnnl::impl::cpu::x64::cpu_isa_t get_isa() const { + return get_static_params()->isa; + } + float get_beta() const { + return m_beta; + } + + dnnl_dim_t get_M() const { + return m_M; + } + dnnl_dim_t get_N() const { + return m_N; + } + dnnl_dim_t get_K() const { + return m_K; + } + + dnnl_dim_t get_LDA() const { + return m_LDA; + } + dnnl_dim_t get_LDB() const { + return m_LDB; + } + dnnl_dim_t get_LDC() const { + return m_LDC; + } + +#ifdef SNIPPETS_DEBUG_CAPS + std::string to_string() const override; +#endif + +protected: + struct StaticBaseParams { + StaticBaseParams(const element::Type& in0_dtype, + const element::Type& in1_dtype, + dnnl::impl::cpu::x64::cpu_isa_t primitive_isa, + size_t 
hash_seed); + virtual ~StaticBaseParams() = default; + + const dnnl_data_type_t dt_in0{dnnl_f32}, dt_in1{dnnl_f32}; + const dnnl::impl::cpu::x64::cpu_isa_t isa{dnnl::impl::cpu::x64::isa_undef}; + + size_t hash() const { + return m_hash; + } + + bool operator==(const StaticBaseParams& rhs) const; + bool operator!=(const StaticBaseParams& rhs) const { + return !(*this == rhs); + } +#ifdef SNIPPETS_DEBUG_CAPS + std::string to_string() const; +#endif + protected: + static size_t compute_hash(size_t hash_seed, + dnnl_data_type_t dt_in0, + dnnl_data_type_t dt_in1, + dnnl::impl::cpu::x64::cpu_isa_t isa); + + const size_t m_hash{0}; + }; + + virtual std::shared_ptr get_static_params() const = 0; + size_t compute_hash() const; + + dnnl_dim_t m_M{0}, m_N{0}, m_K{0}, m_LDA{0}, m_LDB{0}, m_LDC{0}; + float m_beta{0}; + size_t m_hash{SIZE_MAX}; +}; + +class BrgemmBaseKernelExecutor { +public: + virtual ~BrgemmBaseKernelExecutor() = default; + +protected: + static float get_beta(const ov::snippets::lowered::LoopManagerPtr& loop_manager, + int loop_id, + const ov::snippets::lowered::ExpandedLoopInfoPtr& current_expanded_loop_info); + + static void update_config(const ov::snippets::lowered::ExpressionPtr& expr, + const ov::snippets::lowered::LinearIRCPtr& linear_ir, + BrgemmBaseKernelConfig& config); + + static void create_brgemm_kernel(std::shared_ptr& kernel, + dnnl_data_type_t dt0, + dnnl_data_type_t dt1, + dnnl::impl::cpu::x64::cpu_isa_t isa, + dnnl_dim_t M, + dnnl_dim_t N, + dnnl_dim_t K, + dnnl_dim_t LDA, + dnnl_dim_t LDB, + dnnl_dim_t LDC, + float beta, + bool with_amx = false, + char* palette = nullptr); + + static void execute_brgemm_kernel(const std::shared_ptr& kernel, + const void* src, + const void* wei, + void* dst, + void* scratch, + bool with_comp); +}; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp 
b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp index cc79458c7c4c64..dd216517ace12e 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.cpp @@ -4,8 +4,8 @@ #include "brgemm_copy_b.hpp" -#include "snippets/lowered/loop_manager.hpp" #include "emitters/plugin/x64/utils.hpp" +#include "snippets/lowered/loop_manager.hpp" #include "transformations/snippets/x64/op/brgemm_utils.hpp" #define DTYPE_CAST(X) static_cast(DnnlExtensionUtils::ElementTypeToDataType(X)) @@ -16,8 +16,12 @@ using namespace dnnl::impl::cpu::x64; namespace ov { namespace intel_cpu { -BrgemmCopyBKernelConfig::BrgemmCopyBKernelConfig(const element::Type& src_dt, const element::Type& wei_dt, cpu_isa_t isa, - bool is_with_comp, bool is_transposed_B, dnnl_dim_t wei_N_blk) +BrgemmCopyBKernelConfig::BrgemmCopyBKernelConfig(const element::Type& src_dt, + const element::Type& wei_dt, + cpu_isa_t isa, + bool is_with_comp, + bool is_transposed_B, + dnnl_dim_t wei_N_blk) : m_static_params(std::make_shared(src_dt, wei_dt, isa, is_with_comp, is_transposed_B, wei_N_blk)) { m_hash = compute_hash(); } @@ -37,17 +41,28 @@ bool BrgemmCopyBKernelConfig::operator==(const BrgemmCopyBKernelConfig& rhs) con #undef EQ } -void BrgemmCopyBKernelConfig::update(dnnl_dim_t N, dnnl_dim_t N_blk, dnnl_dim_t K, dnnl_dim_t K_blk, dnnl_dim_t copy_B_wei_stride, dnnl_dim_t LDB) { - // If one of the dims is zero, it means that BrgemmCopyB won't be executed (in Loop with work_amount = 0, for example) - // To process this case, we have to make this Config as empty (nullify runtime parameters) +void BrgemmCopyBKernelConfig::update(dnnl_dim_t N, + dnnl_dim_t N_blk, + dnnl_dim_t K, + dnnl_dim_t K_blk, + dnnl_dim_t copy_B_wei_stride, + dnnl_dim_t LDB) { + // If one of the dims is zero, it means that BrgemmCopyB won't be executed (in Loop with work_amount = 0, for + // example) To 
process this case, we have to make this Config as empty (nullify runtime parameters) if (utils::one_of(0, N, K)) { - m_N = 0; m_N_blk = 0; - m_K = 0; m_K_blk = 0; - m_copy_B_wei_stride = 0; m_LDB = 0; + m_N = 0; + m_N_blk = 0; + m_K = 0; + m_K_blk = 0; + m_copy_B_wei_stride = 0; + m_LDB = 0; } else { - m_N = N; m_N_blk = N_blk; - m_K = K; m_K_blk = K_blk; - m_copy_B_wei_stride = copy_B_wei_stride; m_LDB = LDB; + m_N = N; + m_N_blk = N_blk; + m_K = K; + m_K_blk = K_blk; + m_copy_B_wei_stride = copy_B_wei_stride; + m_LDB = LDB; } m_hash = compute_hash(); } @@ -55,59 +70,94 @@ void BrgemmCopyBKernelConfig::update(dnnl_dim_t N, dnnl_dim_t N_blk, dnnl_dim_t size_t BrgemmCopyBKernelConfig::compute_hash() const { size_t seed = m_static_params->hash; #define HASH(X) seed = hash_combine(seed, X) - HASH(m_N); HASH(m_N_blk); - HASH(m_K); HASH(m_K_blk); - HASH(m_copy_B_wei_stride); HASH(m_LDB); + HASH(m_N); + HASH(m_N_blk); + HASH(m_K); + HASH(m_K_blk); + HASH(m_copy_B_wei_stride); + HASH(m_LDB); #undef HASH return seed; } -BrgemmCopyBKernelConfig::StaticParams::StaticParams(const element::Type& src_type, const element::Type& wei_type, cpu_isa_t isa, - bool is_with_comp, bool is_transposed_B, dnnl_dim_t wei_n_blk) - : src_dt(DTYPE_CAST(src_type)), wei_dt(DTYPE_CAST(wei_type)), isa(isa), - is_with_comp(is_with_comp), is_transposed_B(is_transposed_B), wei_N_blk(wei_n_blk), +BrgemmCopyBKernelConfig::StaticParams::StaticParams(const element::Type& src_type, + const element::Type& wei_type, + cpu_isa_t isa, + bool is_with_comp, + bool is_transposed_B, + dnnl_dim_t wei_n_blk) + : src_dt(DTYPE_CAST(src_type)), + wei_dt(DTYPE_CAST(wei_type)), + isa(isa), + is_with_comp(is_with_comp), + is_transposed_B(is_transposed_B), + wei_N_blk(wei_n_blk), hash(init_hash(src_dt, wei_dt, isa, is_with_comp, is_transposed_B, wei_N_blk)) {} bool BrgemmCopyBKernelConfig::StaticParams::operator==(const StaticParams& rhs) const { #define EQ(X) X == rhs.X - return EQ(hash) && EQ(src_dt) && EQ(wei_dt)&& 
EQ(isa) && EQ(is_with_comp) && EQ(is_transposed_B) && EQ(wei_N_blk); + return EQ(hash) && EQ(src_dt) && EQ(wei_dt) && EQ(isa) && EQ(is_with_comp) && EQ(is_transposed_B) && EQ(wei_N_blk); #undef EQ } -size_t BrgemmCopyBKernelConfig::StaticParams::init_hash(const dnnl_data_type_t& src_dt, const dnnl_data_type_t& wei_dt, cpu_isa_t isa, - bool is_with_comp, bool is_transposed_B, dnnl_dim_t wei_N_blk) { +size_t BrgemmCopyBKernelConfig::StaticParams::init_hash(const dnnl_data_type_t& src_dt, + const dnnl_data_type_t& wei_dt, + cpu_isa_t isa, + bool is_with_comp, + bool is_transposed_B, + dnnl_dim_t wei_N_blk) { size_t seed = 0; #define HASH(X) seed = hash_combine(seed, X) - HASH(src_dt); HASH(wei_dt); HASH(isa); - HASH(is_with_comp); HASH(is_transposed_B); HASH(wei_N_blk); + HASH(src_dt); + HASH(wei_dt); + HASH(isa); + HASH(is_with_comp); + HASH(is_transposed_B); + HASH(wei_N_blk); #undef HASH return seed; } #ifdef SNIPPETS_DEBUG_CAPS -#define PRINT(X) ss << #X << " = " << X << "\n" +# define PRINT(X) ss << #X << " = " << X << "\n" std::string BrgemmCopyBKernelConfig::to_string() const { std::stringstream ss; ss << m_static_params->to_string() << "\n"; - PRINT(m_hash); PRINT(m_N); PRINT(m_N_blk); - PRINT(m_K); PRINT(m_K_blk); PRINT(m_LDB); PRINT(m_copy_B_wei_stride); + PRINT(m_hash); + PRINT(m_N); + PRINT(m_N_blk); + PRINT(m_K); + PRINT(m_K_blk); + PRINT(m_LDB); + PRINT(m_copy_B_wei_stride); return ss.str(); } std::string BrgemmCopyBKernelConfig::StaticParams::to_string() const { std::stringstream ss; - PRINT(src_dt); PRINT(wei_dt); PRINT(isa); - PRINT(is_with_comp); PRINT(is_transposed_B); PRINT(wei_N_blk); + PRINT(src_dt); + PRINT(wei_dt); + PRINT(isa); + PRINT(is_with_comp); + PRINT(is_transposed_B); + PRINT(wei_N_blk); return ss.str(); } -#undef PRINT +# undef PRINT #endif BrgemmCopyBKernel::BrgemmCopyBKernel() : jit_generator(jit_name()), ker_(nullptr) {} BrgemmCopyBKernel::BrgemmCopyBKernel(const BrgemmCopyBKernelConfig& conf) - : jit_generator(jit_name()), 
is_with_comp(conf.is_with_comp()), is_transpose(conf.is_transposed_B()), - wei_data_size(dnnl_data_type_size(conf.get_wei_dt())), vnni_factor(data_type_vnni_granularity(conf.get_wei_dt())), - K(conf.get_K()), N_blk(conf.get_N_blk()), wei_N_blk(conf.get_wei_N_blk()), wei_N_tail(conf.get_wei_N_tail()), ker_(nullptr) { + : jit_generator(jit_name()), + is_with_comp(conf.is_with_comp()), + is_transpose(conf.is_transposed_B()), + wei_data_size(dnnl_data_type_size(conf.get_wei_dt())), + vnni_factor(data_type_vnni_granularity(conf.get_wei_dt())), + K(conf.get_K()), + N_blk(conf.get_N_blk()), + wei_N_blk(conf.get_wei_N_blk()), + wei_N_tail(conf.get_wei_N_tail()), + ker_(nullptr) { init_brgemm_copy_b_kernel(dnnl_brgemm_copy_b_kernel, conf); OV_CPU_JIT_EMITTER_ASSERT(dnnl_brgemm_copy_b_kernel, "Kernel is missed!"); } @@ -124,8 +174,9 @@ void BrgemmCopyBKernel::operator()(const call_args* args) const { ker_(args); } -void BrgemmCopyBKernel::init_brgemm_copy_b_kernel(std::unique_ptr& kernel, - const BrgemmCopyBKernelConfig& conf) const { +void BrgemmCopyBKernel::init_brgemm_copy_b_kernel( + std::unique_ptr& kernel, + const BrgemmCopyBKernelConfig& conf) const { matmul::brgemm_matmul_conf_t brgCopyKernelConf; brgCopyKernelConf.src_dt = conf.get_src_dt(); brgCopyKernelConf.wei_dt = conf.get_wei_dt(); @@ -143,8 +194,10 @@ void BrgemmCopyBKernel::init_brgemm_copy_b_kernel(std::unique_ptr(brgCopyKernelConf.wei_dt)); - brgCopyKernelConf.tr_b_dt_sz = DnnlExtensionUtils::sizeOfDataType(static_cast(brgCopyKernelConf.wei_dt)); + brgCopyKernelConf.b_dt_sz = + DnnlExtensionUtils::sizeOfDataType(static_cast(brgCopyKernelConf.wei_dt)); + brgCopyKernelConf.tr_b_dt_sz = + DnnlExtensionUtils::sizeOfDataType(static_cast(brgCopyKernelConf.wei_dt)); brgCopyKernelConf.req_wei_vnni_downconvert = false; @@ -191,28 +244,35 @@ void BrgemmCopyBKernel::generate() { postamble(); } -void BrgemmCopyBKernel::emit_brgemm_copy_b_kernel_call(size_t N, size_t K, size_t offset_in, size_t offset_out, size_t 
offset_comp) { +void BrgemmCopyBKernel::emit_brgemm_copy_b_kernel_call(size_t N, + size_t K, + size_t offset_in, + size_t offset_out, + size_t offset_comp) { EmitABIRegSpills spill(this); spill.preamble(); const auto add_offset = [&](Xbyak::Reg64 reg, size_t bytes_offset) { - if (bytes_offset) add(reg, bytes_offset); + if (bytes_offset) + add(reg, bytes_offset); }; // save function address in gpr to pass in call instruction - const auto& kernel_overload = static_cast(execute); + const auto& kernel_overload = static_cast< + void (*)(matmul::jit_brgemm_matmul_copy_b_t*, const void*, const void*, const void*, size_t, size_t)>(execute); mov(rbp, reinterpret_cast(kernel_overload)); mov(abi_param1, reinterpret_cast(dnnl_brgemm_copy_b_kernel.get())); - add_offset(src_reg, offset_in); // abi_param2 - add_offset(tr_src_reg, offset_out); // abi_param3 - if (is_with_comp) // abi_param4 + add_offset(src_reg, offset_in); // abi_param2 + add_offset(tr_src_reg, offset_out); // abi_param3 + if (is_with_comp) // abi_param4 add_offset(comp_reg, offset_comp); else mov(comp_reg, reinterpret_cast(nullptr)); #ifdef _WIN32 - // Note: ABI requires that the remaining parameters (except the first for) are pushed to the stack in right-to-left order + // Note: ABI requires that the remaining parameters (except the first for) are pushed to the stack in right-to-left + // order // Shadow space will be allocated inside internal_call_rsp_align() push(K); push(N); @@ -233,7 +293,12 @@ void BrgemmCopyBKernel::emit_brgemm_copy_b_kernel_call(size_t N, size_t K, size_ spill.postamble(); } -void BrgemmCopyBKernel::execute(matmul::jit_brgemm_matmul_copy_b_t* kernel, const void* src, const void* dst, const void* comp, size_t N, size_t K) { +void BrgemmCopyBKernel::execute(matmul::jit_brgemm_matmul_copy_b_t* kernel, + const void* src, + const void* dst, + const void* comp, + size_t N, + size_t K) { auto ctx = matmul::jit_brgemm_matmul_copy_b_t::ctx_t(); ctx.current_N_blk = N; ctx.src = src; @@ -248,10 
+313,12 @@ void BrgemmCopyBKernel::execute(matmul::jit_brgemm_matmul_copy_b_t* kernel, cons (*kernel)(&ctx); } -BrgemmCopyBKernelExecutor::BrgemmCopyBKernelExecutor(ov::intel_cpu::MultiCacheWeakPtr kernel_cache, BrgemmCopyBKernelConfig config) - : CPUKernelExecutor(std::move(kernel_cache), std::move(config)) { } +BrgemmCopyBKernelExecutor::BrgemmCopyBKernelExecutor(ov::intel_cpu::MultiCacheWeakPtr kernel_cache, + BrgemmCopyBKernelConfig config) + : CPUKernelExecutor(std::move(kernel_cache), std::move(config)) {} -std::shared_ptr BrgemmCopyBKernelExecutor::compile_kernel(const BrgemmCopyBKernelConfig& config) const { +std::shared_ptr BrgemmCopyBKernelExecutor::compile_kernel( + const BrgemmCopyBKernelConfig& config) const { std::shared_ptr compiled_kernel = std::make_shared(); // BrgemmCopyB is not executable - nothing to compile if (!config.is_empty()) { @@ -283,14 +350,16 @@ void BrgemmCopyBKernelExecutor::update_config(const ov::snippets::lowered::Expre const auto& loop_manager = linear_ir->get_loop_manager(); auto init = [&](size_t& dim, size_t& blk, size_t idx) { - OPENVINO_ASSERT(idx < planar_shape.size() && idx < in_subtensor.size(), "Index must be less than shape/subtensor rank!"); + OPENVINO_ASSERT(idx < planar_shape.size() && idx < in_subtensor.size(), + "Index must be less than shape/subtensor rank!"); dim = *(planar_shape.rbegin() + idx); blk = *(in_subtensor.rbegin() + idx); if (ov::snippets::utils::is_full_dim_value(blk)) { blk = dim; } else { OPENVINO_ASSERT(loop_idx < loop_ids.size(), "Loop is missed"); - const auto& current_expanded_loop_info = loop_manager->get_loop_info(loop_ids[loop_idx++]); + const auto& current_expanded_loop_info = + loop_manager->get_loop_info(loop_ids[loop_idx++]); blk = current_expanded_loop_info->get_increment(); input_desc->set_subtensor_dim(idx, blk); output_desc->set_subtensor_dim(idx, blk); @@ -306,7 +375,9 @@ void BrgemmCopyBKernelExecutor::update_config(const ov::snippets::lowered::Expre const auto& brg_weight_etype = 
expr->get_node()->get_input_element_type(0); const auto LDB = brgemm_utils::repacking::compute_LDB(N_dim, brg_weight_etype); - const auto copy_B_wei_stride = ov::snippets::utils::get_dim_stride(expr->get_input_port(0), config.is_transposed_B() ? 0 : 1) * brg_weight_etype.size(); + const auto copy_B_wei_stride = + ov::snippets::utils::get_dim_stride(expr->get_input_port(0), config.is_transposed_B() ? 0 : 1) * + brg_weight_etype.size(); config.update(N_dim, N_blk, K_dim, K_blk, copy_B_wei_stride, LDB); } @@ -318,5 +389,5 @@ void BrgemmCopyBKernelExecutor::execute(const BrgemmCopyBKernelExecutor* executo (*kernel)(args); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp index c4e3f3622ad88f..b3b107cd676705 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp @@ -4,13 +4,12 @@ #pragma once -#include "emitters/plugin/x64/jit_emitter.hpp" -#include "emitters/snippets/jit_snippets_call_args.hpp" -#include "emitters/snippets/cpu_kernel_executor_table.hpp" - #include #include +#include "emitters/plugin/x64/jit_emitter.hpp" +#include "emitters/snippets/cpu_kernel_executor_table.hpp" +#include "emitters/snippets/jit_snippets_call_args.hpp" namespace ov { namespace intel_cpu { @@ -18,11 +17,17 @@ namespace intel_cpu { struct BrgemmCopyBKernelConfig : public snippets::KernelExecutorBase::GenericConfig { public: BrgemmCopyBKernelConfig() = default; - BrgemmCopyBKernelConfig(const element::Type& src_dt, const element::Type& wei_dt, dnnl::impl::cpu::x64::cpu_isa_t isa, - bool is_with_comp, bool is_transposed_B, dnnl_dim_t wei_N_blk); + BrgemmCopyBKernelConfig(const element::Type& src_dt, + const element::Type& wei_dt, + 
dnnl::impl::cpu::x64::cpu_isa_t isa, + bool is_with_comp, + bool is_transposed_B, + dnnl_dim_t wei_N_blk); bool operator==(const BrgemmCopyBKernelConfig& rhs) const; - bool operator!=(const BrgemmCopyBKernelConfig& rhs) const {return !(*this == rhs);} + bool operator!=(const BrgemmCopyBKernelConfig& rhs) const { + return !(*this == rhs); + } std::unique_ptr get_clone_ptr() const override { return std::unique_ptr(new BrgemmCopyBKernelConfig(*this)); @@ -31,26 +36,61 @@ struct BrgemmCopyBKernelConfig : public snippets::KernelExecutorBase::GenericCon bool is_empty() const; bool is_completed() const override; - void update(dnnl_dim_t N, dnnl_dim_t N_blk, dnnl_dim_t K, dnnl_dim_t K_blk, dnnl_dim_t copy_B_wei_stride, dnnl_dim_t LDB); + void update(dnnl_dim_t N, + dnnl_dim_t N_blk, + dnnl_dim_t K, + dnnl_dim_t K_blk, + dnnl_dim_t copy_B_wei_stride, + dnnl_dim_t LDB); - size_t hash() const override { return m_hash; } + size_t hash() const override { + return m_hash; + } - dnnl_data_type_t get_src_dt() const { return m_static_params->src_dt; } - dnnl_data_type_t get_wei_dt() const { return m_static_params->wei_dt; } + dnnl_data_type_t get_src_dt() const { + return m_static_params->src_dt; + } + dnnl_data_type_t get_wei_dt() const { + return m_static_params->wei_dt; + } - dnnl::impl::cpu::x64::cpu_isa_t get_isa() const { return m_static_params->isa; } - bool is_with_comp() const { return m_static_params->is_with_comp; } - bool is_transposed_B() const { return m_static_params->is_transposed_B; } + dnnl::impl::cpu::x64::cpu_isa_t get_isa() const { + return m_static_params->isa; + } + bool is_with_comp() const { + return m_static_params->is_with_comp; + } + bool is_transposed_B() const { + return m_static_params->is_transposed_B; + } - dnnl_dim_t get_N() const { return m_N; } - dnnl_dim_t get_N_blk() const { return m_N_blk; } - dnnl_dim_t get_N_tail() const { return m_N % m_N_blk; } - dnnl_dim_t get_wei_N_blk() const { return m_static_params->wei_N_blk; } - dnnl_dim_t 
get_wei_N_tail() const { return m_N_blk % m_static_params->wei_N_blk; } - dnnl_dim_t get_K() const { return m_K; } - dnnl_dim_t get_K_blk() const { return m_K_blk; } - dnnl_dim_t get_copy_B_wei_stride() const { return m_copy_B_wei_stride; } - dnnl_dim_t get_LDB() const { return m_LDB; } + dnnl_dim_t get_N() const { + return m_N; + } + dnnl_dim_t get_N_blk() const { + return m_N_blk; + } + dnnl_dim_t get_N_tail() const { + return m_N % m_N_blk; + } + dnnl_dim_t get_wei_N_blk() const { + return m_static_params->wei_N_blk; + } + dnnl_dim_t get_wei_N_tail() const { + return m_N_blk % m_static_params->wei_N_blk; + } + dnnl_dim_t get_K() const { + return m_K; + } + dnnl_dim_t get_K_blk() const { + return m_K_blk; + } + dnnl_dim_t get_copy_B_wei_stride() const { + return m_copy_B_wei_stride; + } + dnnl_dim_t get_LDB() const { + return m_LDB; + } #ifdef SNIPPETS_DEBUG_CAPS std::string to_string() const override; @@ -58,35 +98,45 @@ struct BrgemmCopyBKernelConfig : public snippets::KernelExecutorBase::GenericCon private: struct StaticParams { - StaticParams(const element::Type& src_dt, const element::Type& wei_dt, dnnl::impl::cpu::x64::cpu_isa_t isa, - bool is_with_comp, bool is_transposed_B, dnnl_dim_t wei_N_blk); - - const dnnl_data_type_t src_dt {dnnl_data_type_undef}, wei_dt {dnnl_data_type_undef}; - const dnnl::impl::cpu::x64::cpu_isa_t isa {dnnl::impl::cpu::x64::isa_undef}; - const bool is_with_comp {false}; - const bool is_transposed_B {false}; - const dnnl_dim_t wei_N_blk {0}; - const size_t hash {0}; + StaticParams(const element::Type& src_dt, + const element::Type& wei_dt, + dnnl::impl::cpu::x64::cpu_isa_t isa, + bool is_with_comp, + bool is_transposed_B, + dnnl_dim_t wei_N_blk); + + const dnnl_data_type_t src_dt{dnnl_data_type_undef}, wei_dt{dnnl_data_type_undef}; + const dnnl::impl::cpu::x64::cpu_isa_t isa{dnnl::impl::cpu::x64::isa_undef}; + const bool is_with_comp{false}; + const bool is_transposed_B{false}; + const dnnl_dim_t wei_N_blk{0}; + const size_t 
hash{0}; bool operator==(const StaticParams& rhs) const; - bool operator!=(const StaticParams& rhs) const { return !(*this == rhs); } + bool operator!=(const StaticParams& rhs) const { + return !(*this == rhs); + } #ifdef SNIPPETS_DEBUG_CAPS std::string to_string() const; #endif private: - static size_t init_hash(const dnnl_data_type_t& src_dt, const dnnl_data_type_t& wei_dt, dnnl::impl::cpu::x64::cpu_isa_t primitive_isa, - bool is_with_comp, bool is_transposed_B, dnnl_dim_t wei_N_blk); + static size_t init_hash(const dnnl_data_type_t& src_dt, + const dnnl_data_type_t& wei_dt, + dnnl::impl::cpu::x64::cpu_isa_t primitive_isa, + bool is_with_comp, + bool is_transposed_B, + dnnl_dim_t wei_N_blk); }; size_t compute_hash() const; std::shared_ptr m_static_params; - dnnl_dim_t m_N {0}, m_N_blk {0}; - dnnl_dim_t m_K {0}, m_K_blk {0}; - dnnl_dim_t m_copy_B_wei_stride {0}, m_LDB {0}; - size_t m_hash {SIZE_MAX}; + dnnl_dim_t m_N{0}, m_N_blk{0}; + dnnl_dim_t m_K{0}, m_K_blk{0}; + dnnl_dim_t m_copy_B_wei_stride{0}, m_LDB{0}; + size_t m_hash{SIZE_MAX}; }; struct BrgemmCopyBKernel : public dnnl::impl::cpu::x64::jit_generator { @@ -109,8 +159,12 @@ struct BrgemmCopyBKernel : public dnnl::impl::cpu::x64::jit_generator { void emit_brgemm_copy_b_kernel_call(size_t N, size_t K, size_t offset_in, size_t offset_out, size_t offset_comp); - static void execute(dnnl::impl::cpu::x64::matmul::jit_brgemm_matmul_copy_b_t* kernel, const void* src, const void* dst, const void* comp, - size_t N, size_t K); + static void execute(dnnl::impl::cpu::x64::matmul::jit_brgemm_matmul_copy_b_t* kernel, + const void* src, + const void* dst, + const void* comp, + size_t N, + size_t K); void init_brgemm_copy_b_kernel(std::unique_ptr& kernel, const BrgemmCopyBKernelConfig& conf) const; @@ -151,5 +205,5 @@ class BrgemmCopyBKernelExecutor : public CPUKernelExecutor(); size_t id = SIZE_MAX; switch (port.get_type()) { - case ov::snippets::lowered::ExpressionPort::Type::Input: - offset = 
ma_op->get_input_offset(port.get_index()); - id = get_cluster_id(port.get_port_connector_ptr()->get_source()); - break; - case ov::snippets::lowered::ExpressionPort::Type::Output: - offset = ma_op->get_output_offset(port.get_index()); - for (const auto& child : port.get_connected_ports()) - if (!ov::is_type(child.get_expr()->get_node())) - id = get_cluster_id(child); - break; - default: - OV_CPU_JIT_EMITTER_THROW("Uknown type of expression port!"); + case ov::snippets::lowered::ExpressionPort::Type::Input: + offset = ma_op->get_input_offset(port.get_index()); + id = get_cluster_id(port.get_port_connector_ptr()->get_source()); + break; + case ov::snippets::lowered::ExpressionPort::Type::Output: + offset = ma_op->get_output_offset(port.get_index()); + for (const auto& child : port.get_connected_ports()) + if (!ov::is_type(child.get_expr()->get_node())) + id = get_cluster_id(child); + break; + default: + OV_CPU_JIT_EMITTER_THROW("Uknown type of expression port!"); } OV_CPU_JIT_EMITTER_ASSERT(IMPLICATION(ov::snippets::utils::is_dynamic_value(offset), id != SIZE_MAX), "In dynamic case Buffer Cluster ID must be known!"); @@ -46,31 +45,41 @@ size_t get_buffer_cluster_id(const ov::snippets::lowered::ExpressionPort& port) Xbyak::Reg64 get_aux_gpr(const std::vector& used_gpr_idxs) { // RSP, RBP - stack-related registers, abi_param1 - runtime parameter register in the kernel - static std::unordered_set blacklist_gpr_idxs = { Xbyak::Operand::RSP, Xbyak::Operand::RBP, static_cast(abi_param1.getIdx()) }; + static std::unordered_set blacklist_gpr_idxs = {Xbyak::Operand::RSP, + Xbyak::Operand::RBP, + static_cast(abi_param1.getIdx())}; for (size_t gpr_idx = 0; gpr_idx <= Xbyak::Operand::R15; ++gpr_idx) { - size_t _idx = Xbyak::Operand::R15 - gpr_idx; // we allocate from the end - if (std::find(used_gpr_idxs.cbegin(), used_gpr_idxs.cend(), _idx) != used_gpr_idxs.cend()) continue; - if (blacklist_gpr_idxs.count(_idx) > 0) continue; + size_t _idx = Xbyak::Operand::R15 - gpr_idx; // we 
allocate from the end + if (std::find(used_gpr_idxs.cbegin(), used_gpr_idxs.cend(), _idx) != used_gpr_idxs.cend()) + continue; + if (blacklist_gpr_idxs.count(_idx) > 0) + continue; return Xbyak::Reg64(_idx); } OV_CPU_JIT_EMITTER_THROW("Failed to allocate aux GPR"); } -void push_ptr_with_runtime_offset_on_stack(dnnl::impl::cpu::x64::jit_generator* h, size_t stack_offset, - Xbyak::Reg64 ptr_reg, Xbyak::Reg64 aux_reg, size_t runtime_offset) { +void push_ptr_with_runtime_offset_on_stack(dnnl::impl::cpu::x64::jit_generator* h, + size_t stack_offset, + Xbyak::Reg64 ptr_reg, + Xbyak::Reg64 aux_reg, + size_t runtime_offset) { const auto stack_frame = h->qword[h->rsp + stack_offset]; h->mov(aux_reg, ptr_reg); h->add(aux_reg, h->ptr[abi_param1 + runtime_offset]); h->mov(stack_frame, aux_reg); } -void push_ptr_with_static_offset_on_stack(dnnl::impl::cpu::x64::jit_generator* h, size_t stack_offset, - Xbyak::Reg64 ptr_reg, size_t ptr_offset) { +void push_ptr_with_static_offset_on_stack(dnnl::impl::cpu::x64::jit_generator* h, + size_t stack_offset, + Xbyak::Reg64 ptr_reg, + size_t ptr_offset) { const auto stack_frame = h->qword[h->rsp + stack_offset]; h->mov(stack_frame, ptr_reg); - if (ptr_offset != 0) h->add(stack_frame, ptr_offset); + if (ptr_offset != 0) + h->add(stack_frame, ptr_offset); } -} // namespace utils -} // namespace intel_cpu -} // namespace ov +} // namespace utils +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/utils.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/utils.hpp index 97ea86f404fd67..3d8026ea33c750 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/utils.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/utils.hpp @@ -13,13 +13,17 @@ namespace utils { inline static std::vector transform_idxs_to_regs(const std::vector& idxs) { std::vector regs(idxs.size()); - std::transform(idxs.begin(), idxs.end(), regs.begin(), [](size_t idx){return Xbyak::Reg64(static_cast(idx));}); + 
std::transform(idxs.begin(), idxs.end(), regs.begin(), [](size_t idx) { + return Xbyak::Reg64(static_cast(idx)); + }); return regs; } inline static std::vector transform_snippets_regs_to_idxs(const std::vector& regs) { std::vector idxs(regs.size()); - std::transform(regs.cbegin(), regs.cend(), idxs.begin(), [](const snippets::Reg& reg) { return reg.idx; }); + std::transform(regs.cbegin(), regs.cend(), idxs.begin(), [](const snippets::Reg& reg) { + return reg.idx; + }); return idxs; } @@ -46,8 +50,11 @@ Xbyak::Reg64 get_aux_gpr(const std::vector& used_gpr_idxs); * @param aux_reg aux register * @param runtime_offset offset in runtime params `abi_param1` */ -void push_ptr_with_runtime_offset_on_stack(dnnl::impl::cpu::x64::jit_generator* h, size_t stack_offset, - Xbyak::Reg64 ptr_reg, Xbyak::Reg64 aux_reg, size_t runtime_offset); +void push_ptr_with_runtime_offset_on_stack(dnnl::impl::cpu::x64::jit_generator* h, + size_t stack_offset, + Xbyak::Reg64 ptr_reg, + Xbyak::Reg64 aux_reg, + size_t runtime_offset); /** * @brief Push data pointer on stack adding static offset `ptr_offset` @@ -56,9 +63,11 @@ void push_ptr_with_runtime_offset_on_stack(dnnl::impl::cpu::x64::jit_generator* * @param ptr_reg register contains data pointer * @param ptr_offset offset which will be added to data pointer */ -void push_ptr_with_static_offset_on_stack(dnnl::impl::cpu::x64::jit_generator* h, size_t stack_offset, - Xbyak::Reg64 ptr_reg, size_t ptr_offset); +void push_ptr_with_static_offset_on_stack(dnnl::impl::cpu::x64::jit_generator* h, + size_t stack_offset, + Xbyak::Reg64 ptr_reg, + size_t ptr_offset); -} // namespace utils -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace utils +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/verbose.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/verbose.cpp index 78563bc00aa228..9ac7f0d5cd0ffc 100644 --- 
a/src/plugins/intel_cpu/src/emitters/snippets/x64/verbose.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/verbose.cpp @@ -4,17 +4,20 @@ #ifdef SNIPPETS_DEBUG_CAPS -#include "verbose.hpp" -#include "jit_segfault_detector_emitter.hpp" -#include "jit_memory_emitters.hpp" -#include "jit_brgemm_emitter.hpp" -#include "jit_brgemm_copy_b_emitter.hpp" -#include "jit_kernel_emitter.hpp" -#include "jit_snippets_emitters.hpp" - -#ifndef _WIN32 -#include -#endif +# include "verbose.hpp" + +# include "jit_brgemm_copy_b_emitter.hpp" +# include "jit_brgemm_emitter.hpp" +# include "jit_kernel_emitter.hpp" +# include "jit_memory_emitters.hpp" +# include "jit_segfault_detector_emitter.hpp" +# include "jit_snippets_emitters.hpp" +# include "kernel_executors/brgemm.hpp" +# include "kernel_executors/brgemm_amx.hpp" + +# ifndef _WIN32 +# include +# endif namespace ov { namespace intel_cpu { @@ -41,63 +44,59 @@ std::string vector_to_string(const T& v) { std::string get_emitter_type_name(const jit_emitter* emitter) { std::string name = typeid(*emitter).name(); -#ifndef _WIN32 +# ifndef _WIN32 int status; - std::unique_ptr demangled_name( - abi::__cxa_demangle(name.c_str(), nullptr, nullptr, &status), - std::free); + std::unique_ptr demangled_name(abi::__cxa_demangle(name.c_str(), nullptr, nullptr, &status), + std::free); name = demangled_name.get(); -#endif +# endif return name; } -std::string init_info_jit_memory_emitter(const jit_memory_emitter *emitter) { +std::string init_info_jit_memory_emitter(const jit_memory_emitter* emitter) { std::stringstream ss; - ss << " src_precision:" << emitter->src_prc - << " dst_precision:" << emitter->dst_prc - << " load/store_element_number:" << emitter->count - << " byte_offset:" << emitter->compiled_byte_offset; + ss << " src_precision:" << emitter->src_prc << " dst_precision:" << emitter->dst_prc + << " load/store_element_number:" << emitter->count << " byte_offset:" << emitter->compiled_byte_offset; return ss.str(); } -static std::string 
init_info_jit_load_memory_emitter(const jit_load_memory_emitter *emitter) { +static std::string init_info_jit_load_memory_emitter(const jit_load_memory_emitter* emitter) { std::stringstream ss; std::string memory_emitter_info = init_info_jit_memory_emitter(emitter); - ss << "Emitter_type_name:jit_load_memory_emitter" - << memory_emitter_info; + ss << "Emitter_type_name:jit_load_memory_emitter" << memory_emitter_info; return ss.str(); } -static std::string init_info_jit_load_broadcast_emitter(const jit_load_broadcast_emitter *emitter) { +static std::string init_info_jit_load_broadcast_emitter(const jit_load_broadcast_emitter* emitter) { std::stringstream ss; std::string memory_emitter_info = init_info_jit_memory_emitter(emitter); - ss << "Emitter_type_name:jit_load_broadcast_emitter" - << memory_emitter_info; + ss << "Emitter_type_name:jit_load_broadcast_emitter" << memory_emitter_info; return ss.str(); } -static std::string init_info_jit_store_memory_emitter(const jit_store_memory_emitter *emitter) { +static std::string init_info_jit_store_memory_emitter(const jit_store_memory_emitter* emitter) { std::stringstream ss; std::string memory_emitter_info = init_info_jit_memory_emitter(emitter); - ss << "Emitter_type_name:jit_store_memory_emitter" - << memory_emitter_info; + ss << "Emitter_type_name:jit_store_memory_emitter" << memory_emitter_info; return ss.str(); } -std::string init_info_jit_brgemm_emitter(const jit_brgemm_emitter *emitter) { +std::string init_info_jit_brgemm_emitter(const jit_brgemm_emitter* emitter) { std::stringstream ss; - ss << "Emitter_type_name:jit_brgemm_emitter" - << emitter->m_kernel_executor->to_string() - << " m_memory_offset:" << vector_to_string(emitter->m_memory_offsets) + ss << "Emitter_type_name:jit_brgemm_emitter"; + if (const auto& common = std::dynamic_pointer_cast(emitter->m_kernel_executor)) + ss << common->to_string(); + if (const auto& amx = std::dynamic_pointer_cast(emitter->m_kernel_executor)) + ss << amx->to_string(); + ss << 
" m_memory_offset:" << vector_to_string(emitter->m_memory_offsets) << " m_buffer_ids:" << vector_to_string(emitter->m_buffer_ids); return ss.str(); } -std::string init_info_jit_brgemm_copy_b_emitter(const jit_brgemm_copy_b_emitter *emitter) { +std::string init_info_jit_brgemm_copy_b_emitter(const jit_brgemm_copy_b_emitter* emitter) { std::stringstream ss; - ss << "Emitter_type_name:jit_brgemm_copy_b_emitter" - << emitter->m_kernel_executor->to_string() + ss << "Emitter_type_name:jit_brgemm_copy_b_emitter" << emitter->m_kernel_executor->to_string() << " m_memory_offset:" << vector_to_string(emitter->m_memory_offsets) << " m_buffer_ids:" << vector_to_string(emitter->m_buffer_ids); @@ -108,11 +107,9 @@ std::string init_info_jit_kernel_static_emitter(const jit_kernel_static_emitter* std::stringstream ss; ss << "Emitter_type_name:jit_kernel_static_emitter" << " jcp.exec_domain:" << vector_to_string(emitter->jcp.exec_domain) - << " gp_regs_pool:"<< vector_to_string(emitter->gp_regs_pool) - << " master_shape:" << vector_to_string(emitter->master_shape) - << " num_inputs:" << emitter->num_inputs - << " num_outputs:" << emitter->num_outputs - << " num_unique_buffers:" << emitter->num_unique_buffers + << " gp_regs_pool:" << vector_to_string(emitter->gp_regs_pool) + << " master_shape:" << vector_to_string(emitter->master_shape) << " num_inputs:" << emitter->num_inputs + << " num_outputs:" << emitter->num_outputs << " num_unique_buffers:" << emitter->num_unique_buffers << " data_ptr_regs_idx:" << vector_to_string(emitter->data_ptr_regs_idx) << " vec_regs_pool:" << vector_to_string(emitter->vec_regs_pool) << " reg_indexes_idx:" << emitter->reg_indexes_idx @@ -125,24 +122,20 @@ std::string init_info_jit_kernel_static_emitter(const jit_kernel_static_emitter* std::string init_info_jit_kernel_dynamic_emitter(const jit_kernel_dynamic_emitter* emitter) { std::stringstream ss; ss << "Emitter_type_name:jit_kernel_dynamic_emitter" - << " gp_regs_pool:"<< 
vector_to_string(emitter->gp_regs_pool) - << " num_inputs:" << emitter->num_inputs - << " num_outputs:" << emitter->num_outputs - << " num_unique_buffers:" << emitter->num_unique_buffers + << " gp_regs_pool:" << vector_to_string(emitter->gp_regs_pool) << " num_inputs:" << emitter->num_inputs + << " num_outputs:" << emitter->num_outputs << " num_unique_buffers:" << emitter->num_unique_buffers << " data_ptr_regs_idx:" << vector_to_string(emitter->data_ptr_regs_idx) << " vec_regs_pool:" << vector_to_string(emitter->vec_regs_pool) << " reg_runtime_params_idx:" << emitter->reg_runtime_params_idx; return ss.str(); } -std::string init_info_jit_uni_segfault_detector_emitter(const jit_uni_segfault_detector_emitter *emitter) { +std::string init_info_jit_uni_segfault_detector_emitter(const jit_uni_segfault_detector_emitter* emitter) { std::stringstream ss; - ss << "Node_name:" << emitter->m_target_node_name - << " use_load_emitter:"<< emitter->is_target_use_load_emitter - << " use_store_emitter:"<< emitter->is_target_use_store_emitter; + ss << "Node_name:" << emitter->m_target_node_name << " use_load_emitter:" << emitter->is_target_use_load_emitter + << " use_store_emitter:" << emitter->is_target_use_store_emitter; if (emitter->is_target_use_load_emitter || emitter->is_target_use_store_emitter) { - ss << " start_address:" << emitter->start_address - << " current_address:" << emitter->current_address + ss << " start_address:" << emitter->start_address << " current_address:" << emitter->current_address << " iteration:" << emitter->iteration << " "; } // traget emitter info @@ -152,14 +145,15 @@ std::string init_info_jit_uni_segfault_detector_emitter(const jit_uni_segfault_d return ss.str(); } -static std::string init_info_jit_emitter_general(const jit_emitter *emitter) { +static std::string init_info_jit_emitter_general(const jit_emitter* emitter) { std::stringstream ss; ss << "Emitter_type_name:" << get_emitter_type_name(emitter); return ss.str(); } -void 
jit_emitter_info_t::init(const jit_emitter *emitter) { - if (is_initialized_) return; +void jit_emitter_info_t::init(const jit_emitter* emitter) { + if (is_initialized_) + return; if (auto e_type = dynamic_cast(emitter)) { str_ = init_info_jit_load_memory_emitter(e_type); } else if (auto e_type = dynamic_cast(emitter)) { @@ -182,7 +176,7 @@ void jit_emitter_info_t::init(const jit_emitter *emitter) { is_initialized_ = true; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov #endif \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/verbose.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/verbose.hpp index a81364039b98a7..ffbe210f75d2ff 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/verbose.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/verbose.hpp @@ -4,27 +4,30 @@ #ifdef SNIPPETS_DEBUG_CAPS -#pragma once +# pragma once -#include +# include namespace ov { namespace intel_cpu { class jit_emitter; struct jit_emitter_info_t { jit_emitter_info_t() = default; - jit_emitter_info_t(const jit_emitter_info_t &rhs) - : str_(rhs.str_), is_initialized_(rhs.is_initialized_) {} - jit_emitter_info_t &operator=(const jit_emitter_info_t &rhs) { + jit_emitter_info_t(const jit_emitter_info_t& rhs) : str_(rhs.str_), is_initialized_(rhs.is_initialized_) {} + jit_emitter_info_t& operator=(const jit_emitter_info_t& rhs) { is_initialized_ = rhs.is_initialized_; str_ = rhs.str_; return *this; } - const char *c_str() const { return str_.c_str(); } - bool is_initialized() const { return is_initialized_; } + const char* c_str() const { + return str_.c_str(); + } + bool is_initialized() const { + return is_initialized_; + } - void init(const jit_emitter *emitter); + void init(const jit_emitter* emitter); private: std::string str_; @@ -33,7 +36,7 @@ struct jit_emitter_info_t { std::string get_emitter_type_name(const jit_emitter* emitter); -} // namespace intel_cpu -} // namespace ov +} // 
namespace intel_cpu +} // namespace ov #endif \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/emitters/utils.cpp b/src/plugins/intel_cpu/src/emitters/utils.cpp index b92277ae643218..43172e1b600843 100644 --- a/src/plugins/intel_cpu/src/emitters/utils.cpp +++ b/src/plugins/intel_cpu/src/emitters/utils.cpp @@ -7,28 +7,29 @@ namespace ov { namespace intel_cpu { -std::string jit_emitter_pretty_name(const std::string &pretty_func) { -#define SAFE_SYMBOL_FINDING(idx, find) \ - auto idx = (find); \ +std::string jit_emitter_pretty_name(const std::string& pretty_func) { +#define SAFE_SYMBOL_FINDING(idx, find) \ + auto idx = (find); \ if (idx == std::string::npos || idx == 0) \ return pretty_func; // Example: - // pretty_func := void ov::intel_cpu::jit_load_memory_emitter::emit_impl(const std::vector& in, const std::vector& out) const - // begin := -----------| - // end := ---------------------------------------------------| - // result := ov::intel_cpu::jit_load_memory_emitter + // pretty_func := void ov::intel_cpu::jit_load_memory_emitter::emit_impl(const std::vector& in, const + // std::vector& out) const begin := -----------| end := + // ---------------------------------------------------| result := ov::intel_cpu::jit_load_memory_emitter // Signatures: // GCC: void foo() [with T = {type}] // clang: void foo() [T = {type}] // MSVC: void __cdecl foo<{type}>(void) SAFE_SYMBOL_FINDING(parenthesis, pretty_func.find("(")) - if (pretty_func[parenthesis - 1] == '>') { // To cover template on MSVC + if (pretty_func[parenthesis - 1] == '>') { // To cover template on MSVC parenthesis--; size_t counter = 1; while (counter != 0 && parenthesis > 0) { parenthesis--; - if (pretty_func[parenthesis] == '>') counter++; - if (pretty_func[parenthesis] == '<') counter--; + if (pretty_func[parenthesis] == '>') + counter++; + if (pretty_func[parenthesis] == '<') + counter--; } } SAFE_SYMBOL_FINDING(end, pretty_func.substr(0, parenthesis).rfind("::")) @@ -38,5 +39,5 @@ 
std::string jit_emitter_pretty_name(const std::string &pretty_func) { return end > begin ? pretty_func.substr(begin, end - begin) : pretty_func; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/utils.hpp b/src/plugins/intel_cpu/src/emitters/utils.hpp index 4c3210579d7fd2..7c89b720159dde 100644 --- a/src/plugins/intel_cpu/src/emitters/utils.hpp +++ b/src/plugins/intel_cpu/src/emitters/utils.hpp @@ -5,21 +5,22 @@ #pragma once #include + #include "openvino/core/except.hpp" namespace ov { namespace intel_cpu { -std::string jit_emitter_pretty_name(const std::string &pretty_func); +std::string jit_emitter_pretty_name(const std::string& pretty_func); #ifdef __GNUC__ -#define OV_CPU_JIT_EMITTER_NAME jit_emitter_pretty_name(__PRETTY_FUNCTION__) +# define OV_CPU_JIT_EMITTER_NAME jit_emitter_pretty_name(__PRETTY_FUNCTION__) #else /* __GNUC__ */ -#define OV_CPU_JIT_EMITTER_NAME jit_emitter_pretty_name(__FUNCSIG__) +# define OV_CPU_JIT_EMITTER_NAME jit_emitter_pretty_name(__FUNCSIG__) #endif /* __GNUC__ */ -#define OV_CPU_JIT_EMITTER_THROW(...) OPENVINO_THROW(OV_CPU_JIT_EMITTER_NAME, ": ", __VA_ARGS__) +#define OV_CPU_JIT_EMITTER_THROW(...) OPENVINO_THROW(OV_CPU_JIT_EMITTER_NAME, ": ", __VA_ARGS__) #define OV_CPU_JIT_EMITTER_ASSERT(cond, ...) 
OPENVINO_ASSERT((cond), OV_CPU_JIT_EMITTER_NAME, ": ", __VA_ARGS__) -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/extension.cpp b/src/plugins/intel_cpu/src/extension.cpp index a29282d4af3101..bdb5211009a22a 100644 --- a/src/plugins/intel_cpu/src/extension.cpp +++ b/src/plugins/intel_cpu/src/extension.cpp @@ -7,6 +7,10 @@ #include "openvino/core/op_extension.hpp" #include "ov_ops/augru_cell.hpp" #include "ov_ops/augru_sequence.hpp" +#include "ov_ops/fully_connected.hpp" +#include "ov_ops/fully_connected_compressed.hpp" +#include "ov_ops/fully_connected_quantized.hpp" +#include "ov_ops/fully_connected_quantized_legacy.hpp" #include "ov_ops/gather_compressed.hpp" #include "ov_ops/multiclass_nms_ie_internal.hpp" #include "ov_ops/nms_ie_internal.hpp" @@ -16,15 +20,14 @@ #include "ov_ops/type_relaxed.hpp" #include "snippets/op/subgraph.hpp" #include "transformations/cpu_opset/common/op/causal_mask_preprocess.hpp" -#include "transformations/cpu_opset/common/op/fully_connected.hpp" #include "transformations/cpu_opset/common/op/leaky_relu.hpp" #include "transformations/cpu_opset/common/op/ngram.hpp" #include "transformations/cpu_opset/common/op/power_static.hpp" #include "transformations/cpu_opset/common/op/sdpa.hpp" #include "transformations/cpu_opset/common/op/swish_cpu.hpp" #include "transformations/cpu_opset/x64/op/interaction.hpp" -#include "transformations/cpu_opset/x64/op/mha.hpp" #include "transformations/cpu_opset/x64/op/llm_mlp.hpp" +#include "transformations/cpu_opset/x64/op/mha.hpp" #include "transformations/cpu_opset/x64/op/qkv_proj.hpp" #include "transformations/snippets/x64/op/brgemm_copy_b.hpp" #include "transformations/snippets/x64/op/brgemm_cpu.hpp" @@ -37,8 +40,7 @@ namespace { template class TypeRelaxedExtension : public ov::OpExtension> { public: - TypeRelaxedExtension() - : m_ext_type(Op::get_type_info_static().name, "type_relaxed_opset") {} + TypeRelaxedExtension() : 
m_ext_type(Op::get_type_info_static().name, "type_relaxed_opset") {} ~TypeRelaxedExtension() override = default; const ov::DiscreteTypeInfo& get_type_info() const override { @@ -70,7 +72,6 @@ class TypeRelaxedExtension : public ov::OpExtension> { #endif #define CPU_EXTENSIONS \ - OP_EXTENSION(ov::intel_cpu::FullyConnectedNode) \ OP_EXTENSION(ov::intel_cpu::LeakyReluNode) \ OP_EXTENSION(ov::intel_cpu::PowerStaticNode) \ OP_EXTENSION(ov::intel_cpu::CausalMaskPreprocessNode) \ @@ -85,6 +86,10 @@ class TypeRelaxedExtension : public ov::OpExtension> { OP_EXTENSION(ov::op::internal::NmsStaticShapeIE) \ OP_EXTENSION(ov::op::internal::RMS) \ OP_EXTENSION(ov::op::internal::RoPE) \ + OP_EXTENSION(ov::op::internal::FullyConnected) \ + OP_EXTENSION(ov::op::internal::FullyConnectedCompressed) \ + OP_EXTENSION(ov::op::internal::FullyConnectedQuantizedLegacy) \ + OP_EXTENSION(ov::op::internal::FullyConnectedQuantized) \ OP_EXTENSION_X64(ov::intel_cpu::MHANode) \ OP_EXTENSION_X64(ov::intel_cpu::InteractionNode) \ OP_EXTENSION_X64(ov::intel_cpu::LLMMLPNode) \ @@ -153,31 +158,31 @@ class TypeRelaxedExtension : public ov::OpExtension> { # define SNIPPETS_DEBUG_CAPS_EXTENSIONS #endif -#define SNIPPETS_EXTENSIONS \ - OP_EXTENSION(ov::snippets::op::Brgemm) \ - OP_EXTENSION(ov::snippets::op::BroadcastLoad) \ - OP_EXTENSION(ov::snippets::op::BroadcastMove) \ - OP_EXTENSION(ov::snippets::op::ConvertSaturation) \ - OP_EXTENSION(ov::snippets::op::ConvertTruncation) \ - OP_EXTENSION(ov::snippets::op::Fill) \ - OP_EXTENSION(ov::snippets::op::HorizonMax) \ - OP_EXTENSION(ov::snippets::op::HorizonSum) \ - OP_EXTENSION(ov::snippets::op::KernelStatic) \ - OP_EXTENSION(ov::snippets::op::KernelDynamic) \ - OP_EXTENSION(ov::snippets::op::Load) \ - OP_EXTENSION(ov::snippets::op::LoadReshape) \ - OP_EXTENSION(ov::snippets::op::LoopBegin) \ - OP_EXTENSION(ov::snippets::op::LoopEnd) \ - OP_EXTENSION(ov::snippets::op::Buffer) \ - OP_EXTENSION(ov::snippets::op::Nop) \ - 
OP_EXTENSION(ov::snippets::op::PowerStatic) \ - OP_EXTENSION(ov::snippets::op::Scalar) \ - OP_EXTENSION(ov::snippets::op::Store) \ - OP_EXTENSION(ov::snippets::op::Subgraph) \ - OP_EXTENSION(ov::snippets::op::VectorBuffer) \ - OP_EXTENSION(ov::snippets::op::RankNormalization) \ - OP_EXTENSION(ov::snippets::op::ReduceMax) \ - OP_EXTENSION(ov::snippets::op::ReduceSum) \ +#define SNIPPETS_EXTENSIONS \ + OP_EXTENSION(ov::snippets::op::Brgemm) \ + OP_EXTENSION(ov::snippets::op::BroadcastLoad) \ + OP_EXTENSION(ov::snippets::op::BroadcastMove) \ + OP_EXTENSION(ov::snippets::op::ConvertSaturation) \ + OP_EXTENSION(ov::snippets::op::ConvertTruncation) \ + OP_EXTENSION(ov::snippets::op::Fill) \ + OP_EXTENSION(ov::snippets::op::HorizonMax) \ + OP_EXTENSION(ov::snippets::op::HorizonSum) \ + OP_EXTENSION(ov::snippets::op::KernelStatic) \ + OP_EXTENSION(ov::snippets::op::KernelDynamic) \ + OP_EXTENSION(ov::snippets::op::Load) \ + OP_EXTENSION(ov::snippets::op::LoadReshape) \ + OP_EXTENSION(ov::snippets::op::LoopBegin) \ + OP_EXTENSION(ov::snippets::op::LoopEnd) \ + OP_EXTENSION(ov::snippets::op::Buffer) \ + OP_EXTENSION(ov::snippets::op::Nop) \ + OP_EXTENSION(ov::snippets::op::PowerStatic) \ + OP_EXTENSION(ov::snippets::op::Scalar) \ + OP_EXTENSION(ov::snippets::op::Store) \ + OP_EXTENSION(ov::snippets::op::Subgraph) \ + OP_EXTENSION(ov::snippets::op::VectorBuffer) \ + OP_EXTENSION(ov::snippets::op::RankNormalization) \ + OP_EXTENSION(ov::snippets::op::ReduceMax) \ + OP_EXTENSION(ov::snippets::op::ReduceSum) \ OP_EXTENSION(ov::snippets::op::Reshape) OPENVINO_CREATE_EXTENSIONS(std::vector( diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp index 6aa4644f902bc9..7fb5f512227cf9 100644 --- a/src/plugins/intel_cpu/src/graph.cpp +++ b/src/plugins/intel_cpu/src/graph.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -16,6 +17,7 @@ #include #include +#include "common/primitive_desc_iface.hpp" #include "edge.h" 
#include "graph_dumper.h" #include "graph_optimizer.h" @@ -28,25 +30,21 @@ #include "nodes/common/cpu_memcpy.h" #include "nodes/convert.h" #include "nodes/input.h" -#include "nodes/reorder.h" #include "nodes/memory.hpp" +#include "nodes/reorder.h" #include "openvino/core/except.hpp" #include "openvino/core/model.hpp" #include "openvino/core/node.hpp" +#include "openvino/core/parallel.hpp" #include "openvino/core/type/element_type.hpp" +#include "openvino/runtime/exception.hpp" +#include "openvino/runtime/threading/cpu_streams_executor.hpp" #include "utils/debug_capabilities.h" #include "utils/general_utils.h" #include "utils/ngraph_utils.hpp" #include "utils/node_dumper.h" -#include "utils/verbose.h" #include "utils/precision_support.h" - -#include -#include "common/primitive_desc_iface.hpp" - -#include "openvino/runtime/exception.hpp" -#include "openvino/runtime/threading/cpu_streams_executor.hpp" -#include "openvino/core/parallel.hpp" +#include "utils/verbose.h" #if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO) # include @@ -61,8 +59,8 @@ Graph::~Graph() { CPU_DEBUG_CAP_ENABLE(average_counters(*this)); } -template -void Graph::CreateGraph(NET &model, const GraphContext::CPtr context) { +template +void Graph::CreateGraph(NET& model, const GraphContext::CPtr context) { OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "CreateGraph"); Init(model, context); @@ -104,7 +102,7 @@ void Graph::CreateGraph(const std::vector& graphNodes, template void Graph::CreateGraph(const std::shared_ptr&, const GraphContext::CPtr); -void Graph::Replicate(const std::shared_ptr &model, +void Graph::Replicate(const std::shared_ptr& model, const std::vector& inputConfigs, const std::vector& outputConfigs) { OV_ITT_SCOPE_CHAIN(FIRST_INFERENCE, taskChain, itt::domains::intel_cpu_LT, "Graph::Replicate", "ov::Model"); @@ -135,7 +133,9 @@ void Graph::Replicate(const std::shared_ptr &model, if (op->get_type_info() == op::v0::Parameter::get_type_info_static()) { auto 
input_index = model->get_parameter_index(std::dynamic_pointer_cast(op)); OPENVINO_ASSERT(input_index >= 0, - "CPU plugin cannot find op: ", op->get_friendly_name(), " in model parameter list!"); + "CPU plugin cannot find op: ", + op->get_friendly_name(), + " in model parameter list!"); const auto& config = static_cast(input_index) < inputConfigs.size() ? inputConfigs[input_index] : node::Input::InputConfig{}; @@ -152,7 +152,9 @@ void Graph::Replicate(const std::shared_ptr &model, if (op->get_type_info() == op::v0::Result::get_type_info_static()) { auto output_index = model->get_result_index(std::dynamic_pointer_cast(op)); OPENVINO_ASSERT(output_index >= 0, - "CPU plugin cannot find op: ", op->get_friendly_name(), " in model result list!"); + "CPU plugin cannot find op: ", + op->get_friendly_name(), + " in model result list!"); const auto& config = static_cast(output_index) < outputConfigs.size() ? outputConfigs[output_index] : node::Input::OutputConfig{}; @@ -179,9 +181,9 @@ void Graph::Replicate(const std::shared_ptr &model, } if (!one_of(op->get_type_info(), - op::v0::Result::get_type_info_static(), - op::v3::Assign::get_type_info_static(), - op::v6::Assign::get_type_info_static())) { + op::v0::Result::get_type_info_static(), + op::v3::Assign::get_type_info_static(), + op::v6::Assign::get_type_info_static())) { for (size_t oi = 0; oi < op->get_output_size(); oi++) { if (op->get_output_target_inputs(oi).empty()) { unusedOutputs.push_back(op->output(oi)); @@ -194,10 +196,13 @@ void Graph::Replicate(const std::shared_ptr &model, for (auto unusedOutput : unusedOutputs) { auto parentNode = op2node[unusedOutput.get_node_shared_ptr()]; const auto port = unusedOutput.get_index(); - const auto nodeName = std::string("stub_") + std::to_string(unusedOutput.get_index()) + "_" + parentNode->getName(); + const auto nodeName = + std::string("stub_") + std::to_string(unusedOutput.get_index()) + "_" + parentNode->getName(); const NodePtr outNode = 
std::make_shared(parentNode->outputShapes[port], parentNode->getOriginalOutputPrecisionAtPort(port), - nodeName, "Result", m_context); + nodeName, + "Result", + m_context); CreateEdge(parentNode, outNode, port, 0); AddNode(outNode); } @@ -216,7 +221,7 @@ void Graph::Replicate(const std::shared_ptr &model, EnforceInferencePrecision(); // update input precisions of consumers to avoid extra reorders - for (auto &input : inputNodesMap) { + for (auto& input : inputNodesMap) { const auto& inputNode = input.second; const auto precToSet = inputNode->getOriginalOutputPrecisionAtPort(0); const auto childEdges = inputNode->getChildEdgesAtPort(0); @@ -233,7 +238,7 @@ void Graph::Replicate(const std::shared_ptr &model, // update output precisions of producers to avoid extra reorders // do this only in case output configration is not provided explicitly if (outputConfigs.empty()) { - for (auto &output : outputNodesMap) { + for (auto& output : outputNodesMap) { const auto& outputNode = output.second; const auto precToSet = outputNode->getOriginalInputPrecisionAtPort(0); const auto parentEdge = outputNode->getParentEdgeAt(0); @@ -254,11 +259,12 @@ static std::vector IdentifySyncPoints(const std::vector& graphN continue; if (node->outputShapeDataDependency() || - // WA: for convolution plus sum(broadcast). Due to the fact that a convolution with sum use the same memory for second sum term and the output - // tensors (inPlace) resizing the output tensor, may lead to reallocation of this second term memory and possible data lost. The reallocation - // may happen when the second term shape is broadcasted to the output tensor shape. To avoid the data loss, we have a special processing for - // such cases inside the convolution node, but it works properly only when dynamic shapes inference, preparation and execution a called - // for this node sequentially. + // WA: for convolution plus sum(broadcast). 
Due to the fact that a convolution with sum use the same memory + // for second sum term and the output tensors (inPlace) resizing the output tensor, may lead to reallocation + // of this second term memory and possible data lost. The reallocation may happen when the second term shape + // is broadcasted to the output tensor shape. To avoid the data loss, we have a special processing for such + // cases inside the convolution node, but it works properly only when dynamic shapes inference, preparation + // and execution a called for this node sequentially. (node->getType() == Type::Convolution && node->isInPlace()) || // Due to the special handling of the internal states and initialization subgraphs, MemoryInput nodes must // be processed as a internal dynamism node, allowing to hide the aforementioned complexity inside the @@ -271,15 +277,17 @@ static std::vector IdentifySyncPoints(const std::vector& graphN return syncNodesInds; } -static std::tuple, std::vector> ExtractExecutableNodesAndSyncPoints(const std::vector& syncNodesInds, - const std::vector& graphNodes) { +static std::tuple, std::vector> ExtractExecutableNodesAndSyncPoints( + const std::vector& syncNodesInds, + const std::vector& graphNodes) { OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "Graph::ExtractExecutableNodesAndSyncPoints"); std::unordered_map graphIdToExecutableId; std::vector executableGraphNodes; for (size_t i = 0; i < graphNodes.size(); i++) { const auto& graphNode = graphNodes[i]; - if ((!graphNode->isConstant() && graphNode->isExecutable()) || // non-constant executable or - (graphNode->isDynamicNode() && !one_of(graphNode->getType(), Type::Input, Type::Output))) { // dynamic, except inputs / outputs + if ((!graphNode->isConstant() && graphNode->isExecutable()) || // non-constant executable or + (graphNode->isDynamicNode() && + !one_of(graphNode->getType(), Type::Input, Type::Output))) { // dynamic, except inputs / outputs graphIdToExecutableId[i] = 
executableGraphNodes.size(); executableGraphNodes.emplace_back(graphNode); } @@ -291,17 +299,17 @@ static std::tuple, std::vector> ExtractExecutableNo auto it = graphIdToExecutableId.find(syncNodesInd); if (it != graphIdToExecutableId.end()) { uniqueExecutableSyncNodesInds.insert(it->second); - // since sometimes we need to run the synchronization node alone (for example in the case of internal dynamism) - // let's add another sync index after the sync point node + // since sometimes we need to run the synchronization node alone (for example in the case of internal + // dynamism) let's add another sync index after the sync point node uniqueExecutableSyncNodesInds.insert(it->second + 1); } } uniqueExecutableSyncNodesInds.insert(executableGraphNodes.size()); // convert to a vector to reduce runtime overhead - std::vector executableSyncNodesInds(uniqueExecutableSyncNodesInds.begin(), uniqueExecutableSyncNodesInds.end()); + std::vector executableSyncNodesInds(uniqueExecutableSyncNodesInds.begin(), + uniqueExecutableSyncNodesInds.end()); - return std::make_tuple(std::move(executableGraphNodes), - std::move(executableSyncNodesInds)); + return std::make_tuple(std::move(executableGraphNodes), std::move(executableSyncNodesInds)); } void Graph::Init(const std::shared_ptr& model, @@ -346,7 +354,7 @@ static void UseExternalOutputMemory(const std::map& output } void Graph::Activate(const std::vector& externalInputMemory, - const std::vector& externalOutputMemory) { + const std::vector& externalOutputMemory) { OPENVINO_ASSERT(status == Status::Initialized, "Invalid graph status"); const bool hasDynNodes = ProcessDynNodes(); @@ -360,12 +368,13 @@ void Graph::Activate(const std::vector& externalInputMemory, CreatePrimitivesAndExecConstants(); #ifndef CPU_DEBUG_CAPS - for (auto &graphNode : graphNodes) { + for (auto& graphNode : graphNodes) { graphNode->cleanup(); } #endif - std::tie(m_executableGraphNodes, m_executableSyncNodesInds) = 
ExtractExecutableNodesAndSyncPoints(syncNodesInds, graphNodes); + std::tie(m_executableGraphNodes, m_executableSyncNodesInds) = + ExtractExecutableNodesAndSyncPoints(syncNodesInds, graphNodes); if (hasDynNodes) { status = Status::ReadyDynamic; @@ -424,7 +433,7 @@ void Graph::Configure(bool optimize) { void Graph::InitNodes() { OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "Graph::InitNodes"); - for (auto &node : graphNodes) { + for (auto& node : graphNodes) { node->init(); } } @@ -432,7 +441,7 @@ void Graph::InitNodes() { void Graph::InitDescriptors() { OV_ITT_SCOPE_CHAIN(FIRST_INFERENCE, taskChain, itt::domains::intel_cpu_LT, "InitDescriptors", "Prepare"); - for (auto &node : graphNodes) { + for (auto& node : graphNodes) { OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, node->profiling.getSupportedDescriptors); DEBUG_LOG("Get supported primitive descriptors for node: ", node->getName()); node->getSupportedDescriptors(); @@ -445,15 +454,15 @@ void Graph::InitDescriptors() { const auto& SPDs = node->getSupportedPrimitiveDescriptors(); for (size_t i = 0; i < SPDs.size(); i++) { DEBUG_LOG("#", - node->getExecIndex(), - " ", - node->getName(), - " Before filter, SupportedPrimitiveDescriptors [", - i, - "/", - SPDs.size(), - "]: \n", - SPDs[i]); + node->getExecIndex(), + " ", + node->getName(), + " Before filter, SupportedPrimitiveDescriptors [", + i, + "/", + SPDs.size(), + "]: \n", + SPDs[i]); } } #endif @@ -478,7 +487,7 @@ void Graph::InitDescriptors() { #endif } - for (auto &node : graphNodes) { + for (auto& node : graphNodes) { OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, node->profiling.selectOptimalPrimitiveDescriptor); DEBUG_LOG("Select optimal primitive descriptors for node: ", node->getName()); node->selectOptimalPrimitiveDescriptor(); @@ -495,12 +504,18 @@ void Graph::ResolveInplaceDirections() { void Graph::InitOptimalPrimitiveDescriptors() { OV_ITT_SCOPED_TASK(itt::domains::intel_cpu, "Graph::InitOptimalPrimitiveDescriptors"); - for (auto &node : 
graphNodes) { + for (auto& node : graphNodes) { OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, node->profiling.initOptimalPrimitiveDescriptor); DEBUG_LOG("Init optimal primitive descriptors for node: ", node->getName()); node->initOptimalPrimitiveDescriptor(); - DEBUG_LOG("#", node->getExecIndex(), " ", node->getName(), "\n", - *node->getSelectedPrimitiveDescriptor(), "selectedPrimitiveDescriptorIdx = ", node->selectedPrimitiveDescriptorIndex); + DEBUG_LOG("#", + node->getExecIndex(), + " ", + node->getName(), + "\n", + *node->getSelectedPrimitiveDescriptor(), + "selectedPrimitiveDescriptorIdx = ", + node->selectedPrimitiveDescriptorIndex); } } @@ -508,7 +523,7 @@ void Graph::CreatePrimitivesAndExecConstants() const { OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "Graph::CreatePrimitivesAndExecConstants"); using shared_memory_ptr = WeightsSharing::SharedMemory::Ptr; - auto acquireSharedOutputs = [this](const NodePtr & node) { + auto acquireSharedOutputs = [this](const NodePtr& node) { std::vector outputs; bool hasLocalAllocatedEdges = false; bool hasExternalInvalidEdges = false; @@ -530,7 +545,7 @@ void Graph::CreatePrimitivesAndExecConstants() const { return std::make_tuple(hasExternalInvalidEdges, hasLocalAllocatedEdges, outputs); }; - for (const auto &node : graphNodes) { + for (const auto& node : graphNodes) { { OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, node->profiling.createPrimitive); DEBUG_LOG(*node); @@ -547,7 +562,7 @@ void Graph::CreatePrimitivesAndExecConstants() const { if (std::get<0>(sharedOutputs) || std::get<1>(sharedOutputs)) { ExecuteNodeWithCatch(node); - for (auto & output : std::get<2>(sharedOutputs)) + for (auto& output : std::get<2>(sharedOutputs)) output->valid(true); } } else { @@ -556,7 +571,9 @@ void Graph::CreatePrimitivesAndExecConstants() const { } } -static bool isReorderAvailable(const MemoryDescPtr& parentDesc, const MemoryDescPtr& childDesc, const dnnl::engine& eng) { +static bool 
isReorderAvailable(const MemoryDescPtr& parentDesc, + const MemoryDescPtr& childDesc, + const dnnl::engine& eng) { auto definedParentDesc = parentDesc->isDefined() ? parentDesc : MemoryDescUtils::makeDummyDesc(*parentDesc); memory::desc srcMemDesc = MemoryDescUtils::convertToDnnlMemoryDesc(definedParentDesc)->getDnnlDesc(); @@ -566,14 +583,16 @@ static bool isReorderAvailable(const MemoryDescPtr& parentDesc, const MemoryDesc dnnl::primitive_attr attr; dnnl_primitive_desc_t result = nullptr; - auto status = dnnl_reorder_primitive_desc_create(&result, srcMemDesc.get(), eng.get(), dstMemDesc.get(), eng.get(), + auto status = dnnl_reorder_primitive_desc_create(&result, + srcMemDesc.get(), + eng.get(), + dstMemDesc.get(), + eng.get(), attr.get()); #if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64) // temporary WA for slow FP32->FP16 conversion reorder in oneDNN on ARM // pretend the reorder is not available to use Convert node instead - if (hasHardwareSupport(ov::element::f16) && - result && - parse_impl_name(result->impl()->name()) == ref_any) { + if (hasHardwareSupport(ov::element::f16) && result && parse_impl_name(result->impl()->name()) == ref_any) { dnnl_primitive_desc_destroy(result); return false; } @@ -587,8 +606,8 @@ static bool isReorderAvailable(const MemoryDescPtr& parentDesc, const MemoryDesc void Graph::insertReorder(EdgePtr& edge, bool isOptimized, std::unordered_set& uniqueLayerNames) { std::string basicLayerName = edge->getParent()->getName() + "_" + - node::Reorder::getReorderArgs(edge->getInputDesc(), edge->getOutputDesc()) + "_" + - edge->getChild()->getName(); + node::Reorder::getReorderArgs(edge->getInputDesc(), edge->getOutputDesc()) + "_" + + edge->getChild()->getName(); std::string layerName = basicLayerName; int idx = 0; while (uniqueLayerNames.find(layerName) != uniqueLayerNames.end()) { @@ -605,11 +624,14 @@ void Graph::insertConvert(EdgePtr& edge) { const auto& inDesc = edge->getInputDesc(); const auto& outDesc = 
edge->getOutputDesc(); - std::string convertName = edge->getParent()->getName() + "_" + - inDesc.getPrecision().get_type_name() + "_" + outDesc.getPrecision().get_type_name(); + std::string convertName = edge->getParent()->getName() + "_" + inDesc.getPrecision().get_type_name() + "_" + + outDesc.getPrecision().get_type_name(); - auto convertNode = std::make_shared(inDesc.getShape(), inDesc.getPrecision(), outDesc.getPrecision(), - convertName, m_context); + auto convertNode = std::make_shared(inDesc.getShape(), + inDesc.getPrecision(), + outDesc.getPrecision(), + convertName, + m_context); convertNode->setDescs(inDesc, outDesc); InsertNode(edge, convertNode, true); } @@ -720,9 +742,9 @@ void Graph::AllocateWithReuse(const std::vector& syncNodesInds) { // Resolve special cases: for (size_t i = 0; i < remaining_edge_clusters_count;) { - auto &cluster = edge_clusters[i]; + auto& cluster = edge_clusters[i]; bool erase = false; - for (auto &edge : cluster) { + for (auto& edge : cluster) { // Remove already allocated edges from the mem reuse algo if (edge->getStatus() == Edge::Status::Allocated) { erase = true; @@ -730,18 +752,23 @@ void Graph::AllocateWithReuse(const std::vector& syncNodesInds) { } // Special allocation for string tensors - if (edge->getDesc().getPrecision() == element::string && edge->getStatus() == Edge::Status::NeedAllocation) { + if (edge->getDesc().getPrecision() == element::string && + edge->getStatus() == Edge::Status::NeedAllocation) { StringMemory::StringMemoryBlockPtr memBlcok; if (edge->getParent()->isConstant()) { if (edge->getParent()->getType() == Type::Input) { - auto constNode = static_cast(edge->getParent().get()); + auto constNode = static_cast(edge->getParent().get()); edge->reuse(std::const_pointer_cast(constNode->getMemoryPtr())); } else { edge->externalAllocate(m_context->getWeightsCache()); } - auto stringMemory = dynamic_cast(edge->getMemoryPtr().get()); - OPENVINO_ASSERT(stringMemory, "[CPU] Edge between nodes '", - 
edge->getParent()->getName(), "' and '", edge->getChild()->getName(), "' must have StringMemory."); + auto stringMemory = dynamic_cast(edge->getMemoryPtr().get()); + OPENVINO_ASSERT(stringMemory, + "[CPU] Edge between nodes '", + edge->getParent()->getName(), + "' and '", + edge->getChild()->getName(), + "' must have StringMemory."); memBlcok = stringMemory->getStringMemoryBlockPtr(); } else { auto memory = std::make_shared(getEngine(), edge->getDesc()); @@ -752,13 +779,18 @@ void Graph::AllocateWithReuse(const std::vector& syncNodesInds) { if (edge_c == edge) { continue; } - OPENVINO_ASSERT(edge_c->getDesc().getPrecision() == element::string, "All edges in the cluster must be string."); + OPENVINO_ASSERT(edge_c->getDesc().getPrecision() == element::string, + "All edges in the cluster must be string."); if (edge_c->getStatus() == Edge::Status::NotAllocated) { auto memory = std::make_shared(getEngine(), edge_c->getDesc(), memBlcok); edge_c->reuse(memory); } else { - OPENVINO_THROW("[CPU] String tensors allocation in the cluster. Edge between nodes '", edge_c->getParent()->getName(), "' and '", - edge_c->getChild()->getName(), "' has an unexpected status: ", static_cast(edge_c->getStatus())); + OPENVINO_THROW("[CPU] String tensors allocation in the cluster. 
Edge between nodes '", + edge_c->getParent()->getName(), + "' and '", + edge_c->getChild()->getName(), + "' has an unexpected status: ", + static_cast(edge_c->getStatus())); } } erase = true; @@ -800,14 +832,15 @@ void Graph::AllocateWithReuse(const std::vector& syncNodesInds) { int64_t boxSize = 0; bool isConst = false, isOutput = false, isInput = false; - for (auto &edge : edge_clusters[i]) { + for (auto& edge : edge_clusters[i]) { int e_start = edge->getParent()->getExecIndex(); int e_finish = edge->getChild()->getExecIndex(); auto&& desc = edge->getDesc(); if (boxSize != -1 && desc.isDefined()) { - int64_t e_size = desc.getCurrentMemSize(); // size in bytes (from the beginning of data to the last element) + int64_t e_size = + desc.getCurrentMemSize(); // size in bytes (from the beginning of data to the last element) boxSize = std::max(e_size, boxSize); } else { boxSize = -1; @@ -824,9 +857,9 @@ void Graph::AllocateWithReuse(const std::vector& syncNodesInds) { } reg.alloc_type = allocType; - isConst |= isConstOutput(edge); + isConst |= isConstOutput(edge); isOutput |= edge->getChild()->getType() == Type::Output; - isInput |= edge->getParent()->getType() == Type::Input; + isInput |= edge->getParent()->getType() == Type::Input; } reg.size = boxSize; @@ -878,7 +911,7 @@ void Graph::AllocateWithReuse(const std::vector& syncNodesInds) { memoryRegions.erase(it, memoryRegions.end()); - //Set up the memory control subsystem. + // Set up the memory control subsystem. 
this->m_pMemoryControl = &(getGraphContext()->getNetworkMemoryControl()->createMemoryControlUnit(syncNodesInds)); auto memoryBlocks = m_pMemoryControl->insert(memoryRegions); @@ -911,9 +944,8 @@ void Graph::AllocateWithReuse(const std::vector& syncNodesInds) { } std::vector edges_to_process; edges_to_process.push_back(edge); - for (auto next_edge = edge->getSharedEdge(std::nothrow); - next_edge; - next_edge = next_edge->getSharedEdge(std::nothrow)) { + for (auto next_edge = edge->getSharedEdge(std::nothrow); next_edge; + next_edge = next_edge->getSharedEdge(std::nothrow)) { edges_to_process.push_back(next_edge); } std::for_each(edges_to_process.rbegin(), edges_to_process.rend(), [](const EdgePtr& edge) { @@ -937,16 +969,15 @@ void Graph::AllocateWithReuse(const std::vector& syncNodesInds) { void Graph::Allocate(const std::vector& syncNodesInds) { OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "Graph::Allocate"); - //resolve inplace dead end nodes + // resolve inplace dead end nodes for (const auto& edge : graphEdges) { if (edge->getStatus() == Edge::Status::Uninitialized) { if (edge->getParent()->getParentEdges().empty() && - one_of(edge->getParent()->getType(), Type::Input, Type::MemoryInput) && - edge->inPlace(Edge::LOOK_UP)) { + one_of(edge->getParent()->getType(), Type::Input, Type::MemoryInput) && edge->inPlace(Edge::LOOK_UP)) { edge->getParent()->resolveInPlaceEdges(Edge::LOOK_UP); } else if (edge->getChild()->getChildEdges().empty() && - one_of(edge->getChild()->getType(), Type::Output, Type::MemoryOutput) && - edge->inPlace(Edge::LOOK_DOWN)) { + one_of(edge->getChild()->getType(), Type::Output, Type::MemoryOutput) && + edge->inPlace(Edge::LOOK_DOWN)) { edge->getChild()->resolveInPlaceEdges(Edge::LOOK_DOWN); } } @@ -955,13 +986,15 @@ void Graph::Allocate(const std::vector& syncNodesInds) { // resolve edges. 
Define which will be a view on others // NeedAllocation - real blob // NotAllocated - view on other blob, peer or in-place - for (auto& edge : graphEdges) edge->init(); + for (auto& edge : graphEdges) + edge->init(); // Allocate memory space for all edges marked with NeedAllocation AllocateWithReuse(syncNodesInds); // Check all getters. Should work. - for (auto& edge : graphEdges) edge->validate(); + for (auto& edge : graphEdges) + edge->validate(); } bool Graph::ProcessDynNodes() { @@ -975,7 +1008,8 @@ bool Graph::ProcessDynNodes() { } void Graph::PushInputData(const std::size_t& index, const ov::SoPtr& input) { - if (!IsReady()) OPENVINO_THROW("Wrong state. Topology not ready."); + if (!IsReady()) + OPENVINO_THROW("Wrong state. Topology not ready."); auto input_itr = inputNodesMap.find(index); if (input_itr != inputNodesMap.end()) { auto node = input_itr->second; @@ -1010,7 +1044,7 @@ void Graph::PullOutputData(std::unordered_map>& if (!IsReady()) OPENVINO_THROW("Wrong state. Topology not ready."); - for (auto &outputMap : outputNodesMap) { + for (auto& outputMap : outputNodesMap) { auto output_index = outputMap.first; auto node = outputMap.second; auto parentEdge = node->getParentEdgeAt(0); @@ -1040,17 +1074,32 @@ void Graph::PullOutputData(std::unordered_map>& if (ext_blob->get_shape() != outDims && !isScalarOutput) { // WA: because input/output info initially contains non empty dims, order etc. 
// and setDims (called inside setShape) can't correct modify blocked desc for desc with blocked layout - DEBUG_LOG(output_index, ", tensor data addr ", static_cast(output[output_index]->data()), - " dims ", PartialShape(output[output_index]->get_shape()), " -> ", PartialShape(outDims), - ", intr ptr ", intr_blob.getData(), " , parentedge's memory object ", parentEdge->getMemoryPtr().get()); + DEBUG_LOG(output_index, + ", tensor data addr ", + static_cast(output[output_index]->data()), + " dims ", + PartialShape(output[output_index]->get_shape()), + " -> ", + PartialShape(outDims), + ", intr ptr ", + intr_blob.getData(), + " , parentedge's memory object ", + parentEdge->getMemoryPtr().get()); ext_blob->set_shape(outDims); - DEBUG_LOG(output_index, ", tensor data addr ", static_cast(output[output_index]->data()), - " dims ", PartialShape(output[output_index]->get_shape()), ", intr ptr ", intr_blob.getData()); + DEBUG_LOG(output_index, + ", tensor data addr ", + static_cast(output[output_index]->data()), + " dims ", + PartialShape(output[output_index]->get_shape()), + ", intr ptr ", + intr_blob.getData()); expected_desc_ptr = MemoryDescUtils::generateCpuBlockedMemoryDesc(ext_blob); } // check for empty output blob - if (std::any_of(outDims.begin(), outDims.end(), [](const Dim dim) {return dim == 0;})) { + if (std::any_of(outDims.begin(), outDims.end(), [](const Dim dim) { + return dim == 0; + })) { continue; } @@ -1063,12 +1112,22 @@ void Graph::PullOutputData(std::unordered_map>& intr_blob.getSize(), ")."); - void *ext_blob_ptr = ext_blob->data(); - void *intr_blob_ptr = intr_blob.getData(); - DEBUG_LOG(output_index, " @ ", intr_blob_ptr, " -> ", ext_blob_ptr, " zero-copy: ", intr_blob_ptr == ext_blob_ptr, " graph ", this, "\r\n"); + void* ext_blob_ptr = ext_blob->data(); + void* intr_blob_ptr = intr_blob.getData(); + DEBUG_LOG(output_index, + " @ ", + intr_blob_ptr, + " -> ", + ext_blob_ptr, + " zero-copy: ", + intr_blob_ptr == ext_blob_ptr, + " graph ", + this, + 
"\r\n"); // That is the same memory. No need to copy - if (ext_blob_ptr == intr_blob_ptr) continue; + if (ext_blob_ptr == intr_blob_ptr) + continue; if (actualDesc->getPrecision() == element::string) { StringMemory outBloMem(getEngine(), expected_desc_ptr, ext_blob_ptr); @@ -1077,7 +1136,10 @@ void Graph::PullOutputData(std::unordered_map>& Memory outBloMem(getEngine(), expected_desc_ptr, ext_blob_ptr, false); outBloMem.load(intr_blob, false); } else { - OPENVINO_ASSERT(srcPrec == dstPrec, "The precision of the CPU output tensor index", output_index, " is different from the external one"); + OPENVINO_ASSERT(srcPrec == dstPrec, + "The precision of the CPU output tensor index", + output_index, + " is different from the external one"); size_t size_to_copy = intr_blob.getSize(); cpu_parallel_memcpy(ext_blob_ptr, intr_blob_ptr, size_to_copy); } @@ -1108,7 +1170,8 @@ namespace { class UpdateNodesSeq { public: - explicit UpdateNodesSeq(std::vector& executableGraphNodes) : m_executableGraphNodes(executableGraphNodes) {} + explicit UpdateNodesSeq(std::vector& executableGraphNodes) + : m_executableGraphNodes(executableGraphNodes) {} void operator()(size_t stopIndx) { for (; prepareCounter < stopIndx; ++prepareCounter) { @@ -1126,7 +1189,7 @@ class UpdateNodesSeq { }; #if (OV_THREAD == OV_THREAD_SEQ) - using UpdateNodes = UpdateNodesSeq; +using UpdateNodes = UpdateNodesSeq; #endif #if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO || OV_THREAD == OV_THREAD_OMP) @@ -1143,7 +1206,8 @@ class UpdateNodesSeq { class UpdateNodesBase { public: - explicit UpdateNodesBase(std::vector& executableGraphNodes) : m_executableGraphNodes(executableGraphNodes) {} + explicit UpdateNodesBase(std::vector& executableGraphNodes) + : m_executableGraphNodes(executableGraphNodes) {} void updateShapes(size_t node_indx, size_t stop_indx) { try { for (size_t i = node_indx; i < stop_indx; i++) { @@ -1153,8 +1217,7 @@ class UpdateNodesBase { } m_prepareCounter.store(i, 
ov_memory_order_release); } - } - catch(...) { + } catch (...) { m_completion.store(true, ov_memory_order_relaxed); throw; } @@ -1185,13 +1248,16 @@ class UpdateNodesBase { std::vector& m_executableGraphNodes; }; -#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO) -#if (TBB_VERSION_MAJOR > 2020) +# if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO) +# if (TBB_VERSION_MAJOR > 2020) template class AsyncTask : public tbb::detail::d1::task { public: - AsyncTask(Body& body, tbb::detail::d1::wait_context& wait, size_t node_indx, size_t stop_indx) : - m_body(body), m_wait(wait), m_node_indx(node_indx), m_stop_indx(stop_indx) {} + AsyncTask(Body& body, tbb::detail::d1::wait_context& wait, size_t node_indx, size_t stop_indx) + : m_body(body), + m_wait(wait), + m_node_indx(node_indx), + m_stop_indx(stop_indx) {} task* execute(tbb::detail::d1::execution_data&) override { m_body(m_node_indx, m_stop_indx); m_wait.release(); @@ -1235,11 +1301,14 @@ class UpdateNodes : public UpdateNodesBase { private: tbb::task_group_context ctx; }; -#else +# else template class AsyncTask : public tbb::task { public: - AsyncTask(Body& body, size_t node_indx, size_t stop_indx) : m_body(body), m_node_indx(node_indx), m_stop_indx(stop_indx) {} + AsyncTask(Body& body, size_t node_indx, size_t stop_indx) + : m_body(body), + m_node_indx(node_indx), + m_stop_indx(stop_indx) {} task* execute() override { m_body(m_node_indx, m_stop_indx); return nullptr; @@ -1257,28 +1326,30 @@ class UpdateNodes : public UpdateNodesBase { void operator()(size_t stopIndx) { m_completion.store(false); auto startCounter = m_prepareCounter.load(); - tbb::task& root = *new(tbb::task::allocate_root()) tbb::empty_task; - root.set_ref_count(3); // two for children and one preserved + tbb::task& root = *new (tbb::task::allocate_root()) tbb::empty_task; + root.set_ref_count(3); // two for children and one preserved auto task1 = [this](size_t start, size_t stop) { this->updateShapes(start, stop); }; 
- AsyncTask& a = *new (root.allocate_child()) AsyncTask(task1, startCounter, stopIndx); + AsyncTask& a = + *new (root.allocate_child()) AsyncTask(task1, startCounter, stopIndx); auto task2 = [this](size_t start, size_t stop) { this->updateDynParams(start, stop); }; - AsyncTask& b = *new (root.allocate_child()) AsyncTask(task2, startCounter, stopIndx); + AsyncTask& b = + *new (root.allocate_child()) AsyncTask(task2, startCounter, stopIndx); - b.set_affinity(2); // slot 1 plus 1 + b.set_affinity(2); // slot 1 plus 1 tbb::task::spawn(b); root.spawn_and_wait_for_all(a); } }; -#endif -#endif +# endif +# endif -#if (OV_THREAD == OV_THREAD_OMP) +# if (OV_THREAD == OV_THREAD_OMP) class UpdateNodes : public UpdateNodesBase { public: using UpdateNodesBase::UpdateNodesBase; @@ -1293,14 +1364,15 @@ class UpdateNodes : public UpdateNodesBase { if (origin_nested_levels < 2) { set_max_nested_levels(2); } - // In OpenMP, an exception that is thrown in a parallel region must be caught and handled in the same region by the same thread. - // Therefore, need to pass the error message and throw a new exception outside the parallel region. + // In OpenMP, an exception that is thrown in a parallel region must be caught and handled in the same region by + // the same thread. Therefore, need to pass the error message and throw a new exception outside the parallel + // region. 
const char* what = nullptr; - #pragma omp parallel - #pragma omp sections +# pragma omp parallel +# pragma omp sections { - #pragma omp section +# pragma omp section { try { updateDynParams(startCounter, stopIndx); @@ -1310,7 +1382,7 @@ class UpdateNodes : public UpdateNodesBase { what = "[ CPU ] Could not update dynamic parameters."; } } - #pragma omp section +# pragma omp section { try { updateShapes(startCounter, stopIndx); @@ -1329,18 +1401,18 @@ class UpdateNodes : public UpdateNodesBase { OPENVINO_ASSERT(what == nullptr, what); } }; -#endif +# endif #endif -} // namespace +} // namespace /* group all the profiling macros into a single one * to avoid cluttering a core logic */ #define VERBOSE_PERF_DUMP_ITT_DEBUG_LOG(ittScope, node, config) \ - VERBOSE(node, config.debugCaps.verbose); \ - PERF(node, config.collectPerfCounters); \ - DUMP(node, config.debugCaps, infer_count); \ - OV_ITT_SCOPED_TASK(ittScope, node->profiling.execute); \ + VERBOSE(node, config.debugCaps.verbose); \ + PERF(node, config.collectPerfCounters); \ + DUMP(node, config.debugCaps, infer_count); \ + OV_ITT_SCOPED_TASK(ittScope, node->profiling.execute); \ DEBUG_LOG(*node); inline void Graph::ExecuteNode(const NodePtr& node, SyncInferRequest* request, int numaId) const { @@ -1362,7 +1434,7 @@ inline void Graph::ExecuteNodeWithCatch(const NodePtr& node, SyncInferRequest* r } } -template +template void Graph::InferDynamic(SyncInferRequest* request, int numaId, UpdateStrategy&& update) { size_t inferCounter = 0; for (auto stopIndx : m_executableSyncNodesInds) { @@ -1410,17 +1482,20 @@ void Graph::Infer(SyncInferRequest* request) { InferStatic(request, numaId); break; default: - OPENVINO_ASSERT(IsReady(), "Wrong state of the ov::intel_cpu::Graph. Topology is not ready: ", static_cast(status)); + OPENVINO_ASSERT(IsReady(), + "Wrong state of the ov::intel_cpu::Graph. 
Topology is not ready: ", + static_cast(status)); } - if (infer_count != -1) infer_count++; + if (infer_count != -1) + infer_count++; } void Graph::SortTopologically() { OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "Graph::SortTopologically"); // Set execIndex of all nodes to default invaild value - for (auto &node : graphNodes) { + for (auto& node : graphNodes) { node->execIndex = -1; } @@ -1433,7 +1508,7 @@ void Graph::SortTopologically() { std::function visit; visit = [&execIndexCnt, &sorted, &visit](const NodePtr node) { if (node->execIndex >= 0) - return; // already visited + return; // already visited for (size_t i = 0; i < node->getParentEdges().size(); i++) { visit(node->getParentEdgeAt(i)->getParent()); @@ -1467,7 +1542,7 @@ void Graph::SortTopologically() { // Sort in / out child edges by port index // Make first N (N == port_num) edge indexes match with port index - for (auto &node : graphNodes) { + for (auto& node : graphNodes) { int port_num = node->outputShapes.size(); std::vector res(port_num); @@ -1512,10 +1587,7 @@ void Graph::GetPerfData(std::vector& perfMap) const { } } -void Graph::CreateEdge(const NodePtr& parent, - const NodePtr& child, - int parentPort, - int childPort) { +void Graph::CreateEdge(const NodePtr& parent, const NodePtr& child, int parentPort, int childPort) { assert(parentPort >= 0 && childPort >= 0); auto edge = std::make_shared(parent, child, parentPort, childPort); @@ -1539,24 +1611,28 @@ void Graph::AddNode(NodePtr node) { graphNodes.push_back(node); } -void Graph::DropNode(const NodePtr &node) { +void Graph::DropNode(const NodePtr& node) { auto children = node->childEdges; auto parents = node->parentEdges; for (size_t i = 0; i < parents.size(); i++) { auto p_edge = parents[i].lock(); - if (!p_edge) continue; + if (!p_edge) + continue; auto parent = p_edge->getParent(); - if (!parent) continue; + if (!parent) + continue; const int inNum = p_edge->getInputNum(); RemoveEdge(p_edge); for (size_t j = 0; j < 
children.size(); j++) { auto c_edge = children[j].lock(); - if (!c_edge) continue; + if (!c_edge) + continue; auto child = c_edge->getChild(); - if (!child) continue; + if (!child) + continue; const int outNum = c_edge->getOutputNum(); RemoveEdge(c_edge); @@ -1565,31 +1641,37 @@ void Graph::DropNode(const NodePtr &node) { } } -void Graph::DropDWConvNode(const NodePtr &node) { +void Graph::DropDWConvNode(const NodePtr& node) { auto children = node->childEdges; auto parents = node->parentEdges; auto parentConvEdge = parents[0].lock(); - if (!parentConvEdge) return; + if (!parentConvEdge) + return; auto parentConv = parentConvEdge->getParent(); - if (!parentConv) return; + if (!parentConv) + return; parentConv->outputShapes[0] = node->outputShapes[0]; for (size_t i = 0; i < 1; i++) { auto p_edge = parents[i].lock(); - if (!p_edge) continue; + if (!p_edge) + continue; auto parent = p_edge->getParent(); - if (!parent) continue; + if (!parent) + continue; const int inNum = p_edge->getInputNum(); RemoveEdge(p_edge); for (size_t j = 0; j < children.size(); j++) { auto c_edge = children[j].lock(); - if (!c_edge) continue; + if (!c_edge) + continue; auto child = c_edge->getChild(); - if (!child) continue; + if (!child) + continue; const int outNum = c_edge->getOutputNum(); RemoveEdge(c_edge); @@ -1599,9 +1681,11 @@ void Graph::DropDWConvNode(const NodePtr &node) { for (size_t i = 1; i < parents.size(); i++) { auto p_edge = parents[i].lock(); - if (!p_edge) continue; + if (!p_edge) + continue; auto parent = p_edge->getParent(); - if (!parent) continue; + if (!parent) + continue; const int inNum = p_edge->getInputNum(); const int portCandidate = p_edge->getOutputNum(); @@ -1615,14 +1699,20 @@ void Graph::DropDWConvNode(const NodePtr &node) { } void Graph::RemoveDroppedNodes() { - graphNodes.erase(std::remove_if(graphNodes.begin(), graphNodes.end(), - [](const NodePtr& node){ return node->isDropped(); }), + graphNodes.erase(std::remove_if(graphNodes.begin(), + graphNodes.end(), 
+ [](const NodePtr& node) { + return node->isDropped(); + }), graphNodes.end()); } void Graph::RemoveDroppedEdges() { - graphEdges.erase(std::remove_if(graphEdges.begin(), graphEdges.end(), - [](const EdgePtr& node){ return node->isDropped(); }), + graphEdges.erase(std::remove_if(graphEdges.begin(), + graphEdges.end(), + [](const EdgePtr& node) { + return node->isDropped(); + }), graphEdges.end()); } @@ -1631,19 +1721,28 @@ NodePtr Graph::InsertReorder(EdgePtr edge, const MemoryDesc& inDesc, const MemoryDesc& outDesc, bool isOptimized, - const std::vector & src_perm) { + const std::vector& src_perm) { auto reorder = std::make_shared(inDesc, outDesc, layerName, m_context); reorder->setOptimized(isOptimized); reorder->setSrcPermutation(src_perm); DEBUG_LOG(reorder->getName(), " edge=", *edge, " isOptimized=", isOptimized); - DEBUG_LOG(" inDesc: ", inDesc.getShape().toString(), inDesc.getPrecision().get_type_name(), " ", inDesc.serializeFormat()); - DEBUG_LOG(" outDesc: ", outDesc.getShape().toString(), outDesc.getPrecision().get_type_name(), " ", outDesc.serializeFormat()); + DEBUG_LOG(" inDesc: ", + inDesc.getShape().toString(), + inDesc.getPrecision().get_type_name(), + " ", + inDesc.serializeFormat()); + DEBUG_LOG(" outDesc: ", + outDesc.getShape().toString(), + outDesc.getPrecision().get_type_name(), + " ", + outDesc.serializeFormat()); InsertNode(edge, reorder, true); // Using the method Edge::getDesc() we can check that input and output tensor descriptors are equal. - // Due to the specificity of GraphOptimizer::MergeTransposeAndReorder() that isOptimized flag uses, we shouldn't do these checks. + // Due to the specificity of GraphOptimizer::MergeTransposeAndReorder() that isOptimized flag uses, we shouldn't do + // these checks. 
if (!isOptimized) { reorder->getParentEdgeAt(0)->getDesc(); reorder->getChildEdgeAt(0)->getDesc(); @@ -1692,10 +1791,10 @@ void Graph::EnforceInferencePrecision() { const auto inferPrec = getConfig().inferencePrecision; if (one_of(inferPrec, element::f32, element::undefined, ov::element::f16)) - return; // nothing to do, only precision reduction is currently allowed + return; // nothing to do, only precision reduction is currently allowed #if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64) if (inferPrec == ov::element::f16) - return; // precision of configured by ov::pass::ConvertPrecision + return; // precision of configured by ov::pass::ConvertPrecision #endif std::function& skipNodes)> searchForNodesToSkip; searchForNodesToSkip = [&](const NodePtr& node, std::unordered_set& skipNodes) -> void { @@ -1703,35 +1802,35 @@ void Graph::EnforceInferencePrecision() { const auto& parent = node->getParentEdgeAt(i)->getParent(); if (inferPrec == ov::element::bf16) { /* list of node types that must be forced to be executed in BF16 precision - * because of performance gains */ + * because of performance gains */ if (one_of(parent->getType(), - Type::Convolution, // conv nets - Type::FullyConnected, // conv / bert nets - Type::RNNCell, // recurent nets - Type::RNNSeq, // recurent nets - Type::MatMul, // bert nets - Type::ROIPooling, // object detection nets - Type::Interpolate, // super resolution nets - Type::PagedAttention, // page attention - Type::QKVProjection, - Type::LLMMLP)) - continue; // stop at significant nodes + Type::Convolution, // conv nets + Type::FullyConnected, // conv / bert nets + Type::RNNCell, // recurent nets + Type::RNNSeq, // recurent nets + Type::MatMul, // bert nets + Type::ROIPooling, // object detection nets + Type::Interpolate, // super resolution nets + Type::PagedAttention, // page attention + Type::QKVProjection, + Type::LLMMLP)) + continue; // stop at significant nodes } else if (inferPrec == ov::element::f16) { /* list of node 
types that must be forced to be executed in FP16 precision - * because of performance gains */ + * because of performance gains */ if (one_of(parent->getType(), - Type::Convolution, // conv nets - Type::Deconvolution, // deconv - Type::FullyConnected, // conv / bert nets - Type::MatMul, // bert nets - Type::Pooling, - Type::MVN)) - continue; // stop at significant nodes + Type::Convolution, // conv nets + Type::Deconvolution, // deconv + Type::FullyConnected, // conv / bert nets + Type::MatMul, // bert nets + Type::Pooling, + Type::MVN)) + continue; // stop at significant nodes } const auto res = skipNodes.insert(parent); - if (res.second) // node not visited yet + if (res.second) // node not visited yet searchForNodesToSkip(parent, skipNodes); } }; @@ -1772,10 +1871,10 @@ void Graph::EnforceInferencePrecision() { // kvcache of PagedAttention should be written directly if (node->getType() == Type::PagedAttention && (inPort == 3 || inPort == 4)) return true; - const auto &parent = node->getParentEdgeAt(inPort)->getParent(); + const auto& parent = node->getParentEdgeAt(inPort)->getParent(); /* Skip BF16 enforcement for nodes after Constant Inputs for maintaining precision for fusing. - * Element type conversion to bf16 is done automatically, if convolution follows up after Constant Inputs - * and activation is bf16 */ + * Element type conversion to bf16 is done automatically, if convolution follows up after Constant + * Inputs and activation is bf16 */ if (parent->getType() == Type::Input && parent->isConstant() && // Concatenation node is exception because it doesn't change an accuracy for BF16 activation node->getType() != Type::Concatenation) @@ -1815,7 +1914,7 @@ void Graph::EnforceInferencePrecision() { // exclude Convert before Range since it may cause precision loss when integter type to LP. // TODO: Incorrect subgraph is generated by ONNX FE + ticket 117861. 
- const auto &child = node->getChildEdgeAt(i)->getChild(); + const auto& child = node->getChildEdgeAt(i)->getChild(); if (child->getType() == Type::Range && node->getType() == Type::Convert) continue; // skip second output of PagedAttention @@ -1845,5 +1944,5 @@ const std::unordered_map& Graph::getInterna return m_context->getMemoryStatesRegister()->getMemoryStates(); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/graph.h b/src/plugins/intel_cpu/src/graph.h index d50ccc152c9186..bdf3205d2baaaf 100644 --- a/src/plugins/intel_cpu/src/graph.h +++ b/src/plugins/intel_cpu/src/graph.h @@ -4,22 +4,20 @@ #pragma once +#include +#include +#include +#include + #include "config.h" #include "cpu_memory.h" -#include "nodes/input.h" -#include "openvino/core/node_vector.hpp" -#include "openvino/runtime/profiling_info.hpp" -#include "node.h" #include "edge.h" #include "graph_context.h" #include "memory_control.hpp" +#include "node.h" +#include "nodes/input.h" +#include "openvino/core/node_vector.hpp" #include "openvino/runtime/profiling_info.hpp" - -#include -#include -#include -#include - #include "openvino/runtime/so_ptr.hpp" #include "proxy_mem_blk.h" @@ -29,7 +27,7 @@ namespace intel_cpu { class SyncInferRequest; namespace node { class MemoryStateNode; -} // namespace node +} // namespace node class Graph { public: @@ -61,15 +59,15 @@ class Graph { return IsStatic() || IsDynamic(); } - const Config & getConfig() const { + const Config& getConfig() const { return m_context->getConfig(); } - template - void CreateGraph(NET &model, const GraphContext::CPtr context); + template + void CreateGraph(NET& model, const GraphContext::CPtr context); - void CreateGraph(const std::vector &graphNodes, - const std::vector &graphEdges, + void CreateGraph(const std::vector& graphNodes, + const std::vector& graphEdges, const GraphContext::CPtr context, std::string name); @@ -97,14 +95,14 @@ class Graph { return 
outputNodesMap; } - NodePtr getInputNodeByIndex(const std::size_t &index) { + NodePtr getInputNodeByIndex(const std::size_t& index) { auto input = inputNodesMap.find(index); if (input == inputNodesMap.end()) OPENVINO_THROW("CPU execution graph doesn't contain input node with index: ", index); return input->second; } - NodePtr getOutputNodeByIndex(const std::size_t &index) { + NodePtr getOutputNodeByIndex(const std::size_t& index) { auto output = outputNodesMap.find(index); if (output == outputNodesMap.end()) OPENVINO_THROW("CPU execution graph doesn't contain output node with index: ", index); @@ -119,12 +117,9 @@ class Graph { return m_context; } - void GetPerfData(std::vector &perfMap) const; + void GetPerfData(std::vector& perfMap) const; - void CreateEdge(const NodePtr& parent, - const NodePtr& child, - int parentPort = 0, - int childPort = 0); + void CreateEdge(const NodePtr& parent, const NodePtr& child, int parentPort = 0, int childPort = 0); void RemoveEdge(const EdgePtr& edge); void RemoveDroppedNodes(); void RemoveDroppedEdges(); @@ -134,9 +129,9 @@ class Graph { /** * @brief Insert Reorder node at the edge-specified location. - * The Reorder node must be inserted in case when there are inplace conflicts or the input and output tensor descriptors do not match. - * The Reorder node rearranges the elements in memory according to inDesc and outDesc, or reinterprets memory descriptor without - * rearrangement of elements if isOptimized is true. + * The Reorder node must be inserted in case when there are inplace conflicts or the input and output tensor + * descriptors do not match. The Reorder node rearranges the elements in memory according to inDesc and outDesc, or + * reinterprets memory descriptor without rearrangement of elements if isOptimized is true. 
* @param edge * pointer to the edge in the graph where Reorder node will be inserted * @param layerName @@ -153,14 +148,18 @@ class Graph { * pointer to the blob containing scales * @return pointer to the new Reorder node. */ - NodePtr InsertReorder(EdgePtr edge, std::string layerName, const MemoryDesc& inDesc, - const MemoryDesc& outDesc, bool isOptimized = false, const std::vector & src_perm = {}); + NodePtr InsertReorder(EdgePtr edge, + std::string layerName, + const MemoryDesc& inDesc, + const MemoryDesc& outDesc, + bool isOptimized = false, + const std::vector& src_perm = {}); /** * @brief Insert Node at the edge-specified location. - * This method supports two regimes. First, the node is inserted without initialization (i.e. supported descriptors initialization, - * supported primitive descriptors selection, etc.), which can be useful after the ResolveEdgeConflicts() completes. The second is just inserting the - * node without initialization. + * This method supports two regimes. First, the node is inserted without initialization (i.e. supported descriptors + * initialization, supported primitive descriptors selection, etc.), which can be useful after the + * ResolveEdgeConflicts() completes. The second is just inserting the node without initialization. * @param edge * pointer to the edge in the graph where the node will be inserted * @param node @@ -173,10 +172,10 @@ class Graph { /** * @brief Insert Node between two specified nodes. - * This procedure creates two edges that link the parent and child nodes to the inserted one and adds all created objects to the graph. - * This method supports two regimes. First, the node is inserted without initialization (i.e. supported descriptors initialization, - * supported primitive descriptors selection, etc.), which can be useful after the ResolveEdgeConflicts() completes. The second is just inserting the - * node without initialization. 
+ * This procedure creates two edges that link the parent and child nodes to the inserted one and adds all created + * objects to the graph. This method supports two regimes. First, the node is inserted without initialization (i.e. + * supported descriptors initialization, supported primitive descriptors selection, etc.), which can be useful after + * the ResolveEdgeConflicts() completes. The second is just inserting the node without initialization. * @param parent * pointer to the parent node * @param child @@ -193,7 +192,9 @@ class Graph { std::shared_ptr dump() const; - void ResetInferCount() { infer_count = 0; } + void ResetInferCount() { + infer_count = 0; + } void SortTopologically(); @@ -215,7 +216,7 @@ class Graph { * Activate execution graph using \p externalInputMemory and \p externalOutputMemory */ void Activate(const std::vector& externalInputMemory = {}, - const std::vector& externalOutputMemory = {}); + const std::vector& externalOutputMemory = {}); const std::unordered_map& getOutputNodesMemBlocksMap() const { return outputNodesMemBlocksMap; @@ -231,7 +232,7 @@ class Graph { graphEdges.clear(); m_executableSyncNodesInds.clear(); } - Status status { Status::NotReady }; + Status status{Status::NotReady}; // For dumping purposes. 
-1 - no counting, all other positive // values mean increment it within each Infer() call @@ -244,7 +245,7 @@ class Graph { bool graphHasDynamicInput = false; - void Replicate(const std::shared_ptr &subgraph, + void Replicate(const std::shared_ptr& subgraph, const std::vector& inputConfigs = {}, const std::vector& outputConfigs = {}); @@ -281,10 +282,10 @@ class Graph { void ExecuteNode(const NodePtr& node, SyncInferRequest* request = nullptr, int numaId = -1) const; void InferStatic(SyncInferRequest* request, int numaId); - template + template void InferDynamic(SyncInferRequest* request, int numaId, UpdateStrategy&& update); - friend std::shared_ptr dump_graph_as_ie_ngraph_net(const Graph &graph); + friend std::shared_ptr dump_graph_as_ie_ngraph_net(const Graph& graph); private: using event_t = void (Graph::*)(void); diff --git a/src/plugins/intel_cpu/src/graph_context.cpp b/src/plugins/intel_cpu/src/graph_context.cpp index e200766fa4791c..462cdab2a9b5c0 100644 --- a/src/plugins/intel_cpu/src/graph_context.cpp +++ b/src/plugins/intel_cpu/src/graph_context.cpp @@ -1,10 +1,11 @@ // Copyright (C) 2018-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // -#include "dnnl_types.h" #include "graph_context.h" -#include "nodes/memory.hpp" + +#include "dnnl_types.h" #include "memory_control.hpp" +#include "nodes/memory.hpp" namespace ov { namespace intel_cpu { @@ -27,6 +28,7 @@ GraphContext::GraphContext(const Config& config, numNumaNodes = 1; if (streamExecutor) { cpuStreamExecutor = std::dynamic_pointer_cast(streamExecutor); + numaNodeId = cpuStreamExecutor ? 
cpuStreamExecutor->get_numa_node_id() : 0; auto nNumaNodes = get_num_numa_nodes(); if (numNumaNodes < nNumaNodes) numNumaNodes = nNumaNodes; @@ -41,5 +43,5 @@ const dnnl::engine& GraphContext::getEngine() { return eng; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/graph_context.h b/src/plugins/intel_cpu/src/graph_context.h index db2b126213978c..d13872129325b4 100644 --- a/src/plugins/intel_cpu/src/graph_context.h +++ b/src/plugins/intel_cpu/src/graph_context.h @@ -4,11 +4,11 @@ #pragma once -#include "openvino/runtime/threading/cpu_streams_executor.hpp" -#include "sub_memory_manager.hpp" #include "cache/multi_cache.h" #include "config.h" #include "dnnl_scratch_pad.h" +#include "openvino/runtime/threading/cpu_streams_executor.hpp" +#include "sub_memory_manager.hpp" #include "weights_cache.hpp" namespace ov { @@ -16,7 +16,7 @@ namespace intel_cpu { namespace node { class MemoryStatesRegister; -} // namespace node +} // namespace node class NetworkMemoryControl; @@ -39,17 +39,12 @@ class GraphContext { return weightsCache; } - MultiCachePtr getParamsCache() const { return rtParamsCache; } - DnnlScratchPadPtr getScratchPad(int subStreamID = 0) const { - if (subStreamID < 0) - subStreamID = 0; - if (subStreamID >= numNumaNodes - 1) - subStreamID = numNumaNodes - 1; - return rtScratchPads[subStreamID]; + DnnlScratchPadPtr getScratchPad() const { + return rtScratchPads[numaNodeId]; } const std::vector& getScratchPads() const { @@ -85,7 +80,7 @@ class GraphContext { private: Config config; // network-level config - WeightsSharing::Ptr weightsCache; // per NUMA node caches for sharing weights data + WeightsSharing::Ptr weightsCache; // per NUMA node caches for sharing weights data MultiCachePtr rtParamsCache; // primitive cache DnnlScratchPadPtr rtScratchPad; // scratch pad @@ -94,13 +89,14 @@ class GraphContext { std::vector rtScratchPads; // scratch pad (each sub-stream has its own copy) 
- ov::threading::IStreamsExecutor::Ptr streamExecutor; // stream executor for current graph + ov::threading::IStreamsExecutor::Ptr streamExecutor; // stream executor for current graph - ov::threading::CPUStreamsExecutor::Ptr cpuStreamExecutor; // cpu stream executor for current graph + ov::threading::CPUStreamsExecutor::Ptr cpuStreamExecutor; // cpu stream executor for current graph std::shared_ptr subMemoryManager; int numNumaNodes = 1; + int numaNodeId = 0; std::shared_ptr memoryStatesRegister; std::shared_ptr networkMemoryControl; diff --git a/src/plugins/intel_cpu/src/graph_dumper.cpp b/src/plugins/intel_cpu/src/graph_dumper.cpp index 04c15408743c71..5a3a95362267fe 100644 --- a/src/plugins/intel_cpu/src/graph_dumper.cpp +++ b/src/plugins/intel_cpu/src/graph_dumper.cpp @@ -4,28 +4,28 @@ #include "graph_dumper.h" -#include "dnnl_debug.h" -#include "openvino/pass/manager.hpp" -#include "openvino/pass/serialize.hpp" -#include "openvino/runtime/exec_model_info.hpp" -#include "utils/debug_capabilities.h" - #include +#include #include #include #include #include -#include + +#include "dnnl_debug.h" +#include "openvino/pass/manager.hpp" +#include "openvino/pass/serialize.hpp" +#include "openvino/runtime/exec_model_info.hpp" +#include "utils/debug_capabilities.h" namespace ov { namespace intel_cpu { -void serializeToCout(const Graph &graph); -void serializeToXML(const Graph &graph, const std::string& path); +void serializeToCout(const Graph& graph); +void serializeToXML(const Graph& graph, const std::string& path); namespace { -std::map extract_node_metadata(const NodePtr &node) { +std::map extract_node_metadata(const NodePtr& node) { std::map serialization_info; if (node->getType() == Type::Input && node->isConstant()) { @@ -47,7 +47,8 @@ std::map extract_node_metadata(const NodePtr &node) { bool isAllEqual = true; for (size_t i = 1; i < node->getChildEdges().size(); i++) { - if (node->getChildEdgeAt(i - 1)->getMemory().getDesc().getPrecision() != 
node->getChildEdgeAt(i)->getMemory().getDesc().getPrecision()) { + if (node->getChildEdgeAt(i - 1)->getMemory().getDesc().getPrecision() != + node->getChildEdgeAt(i)->getMemory().getDesc().getPrecision()) { isAllEqual = false; break; } @@ -56,7 +57,8 @@ std::map extract_node_metadata(const NodePtr &node) { // If all output precisions are the same, we store the name only once if (!isAllEqual) { for (size_t i = 1; i < node->getChildEdges().size(); i++) - outputPrecisionsStr += "," + std::string(node->getChildEdgeAt(i)->getMemory().getDesc().getPrecision().get_type_name()); + outputPrecisionsStr += + "," + std::string(node->getChildEdgeAt(i)->getMemory().getDesc().getPrecision().get_type_name()); } } else { // Branch to correctly handle output nodes @@ -107,8 +109,8 @@ std::map extract_node_metadata(const NodePtr &node) { } // namespace -std::shared_ptr dump_graph_as_ie_ngraph_net(const Graph &graph) { - std::map > node2layer; +std::shared_ptr dump_graph_as_ie_ngraph_net(const Graph& graph) { + std::map> node2layer; ov::ResultVector results; ov::ParameterVector params; @@ -117,7 +119,7 @@ std::shared_ptr dump_graph_as_ie_ngraph_net(const Graph &graph) { std::map> paramsMap; std::map> resultsMap; - auto get_inputs = [&] (const NodePtr & node) { + auto get_inputs = [&](const NodePtr& node) { auto pr_edges = node->getParentEdges(); ov::OutputVector inputs(pr_edges.size()); @@ -136,10 +138,10 @@ std::shared_ptr dump_graph_as_ie_ngraph_net(const Graph &graph) { return inputs; }; - auto create_ngraph_node = [&](const NodePtr &node) { + auto create_ngraph_node = [&](const NodePtr& node) { bool is_input = false, is_output = false, should_be_hold = false; size_t input_index = -1, output_index = -1; - for (auto && kvp : graph.inputNodesMap) { + for (auto&& kvp : graph.inputNodesMap) { if (kvp.second == node) { is_input = true; input_index = kvp.first; @@ -147,7 +149,7 @@ std::shared_ptr dump_graph_as_ie_ngraph_net(const Graph &graph) { } } - for (auto && kvp : 
graph.outputNodesMap) { + for (auto&& kvp : graph.outputNodesMap) { if (kvp.second == node) { is_output = true; output_index = kvp.first; @@ -174,7 +176,8 @@ std::shared_ptr dump_graph_as_ie_ngraph_net(const Graph &graph) { return_node = result; } else { return_node = std::make_shared( - get_inputs(node), node->getSelectedPrimitiveDescriptor()->getConfig().outConfs.size()); + get_inputs(node), + node->getSelectedPrimitiveDescriptor()->getConfig().outConfs.size()); for (size_t port = 0; port < return_node->get_output_size(); ++port) { auto& desc = node->getChildEdgeAt(port)->getMemory().getDesc(); @@ -186,7 +189,7 @@ std::shared_ptr dump_graph_as_ie_ngraph_net(const Graph &graph) { to_hold.push_back(return_node); } - for (auto && kvp : meta_data) + for (auto&& kvp : meta_data) return_node->get_rt_info()[kvp.first] = kvp.second; return_node->set_friendly_name(node->getName()); @@ -195,18 +198,18 @@ std::shared_ptr dump_graph_as_ie_ngraph_net(const Graph &graph) { ov::NodeVector nodes; nodes.reserve(graph.graphNodes.size()); - for (auto &node : graph.graphNodes) { // important: graph.graphNodes are in topological order + for (auto& node : graph.graphNodes) { // important: graph.graphNodes are in topological order nodes.emplace_back(create_ngraph_node(node)); node2layer[node] = nodes.back(); } - for (auto && kvp : paramsMap) + for (auto&& kvp : paramsMap) params.push_back(kvp.second); - for (auto && kvp : resultsMap) + for (auto&& kvp : resultsMap) results.push_back(kvp.second); auto holder = !results.empty() ? 
results[0] : std::make_shared(); - for (auto &node : to_hold) { + for (auto& node : to_hold) { holder->add_control_dependency(node); } @@ -214,7 +217,7 @@ std::shared_ptr dump_graph_as_ie_ngraph_net(const Graph &graph) { } #ifdef CPU_DEBUG_CAPS -void serialize(const Graph &graph) { +void serialize(const Graph& graph) { const std::string& path = graph.getConfig().debugCaps.execGraphPath; if (path.empty()) @@ -231,19 +234,17 @@ void serialize(const Graph &graph) { } } -void serializeToXML(const Graph &graph, const std::string& path) { +void serializeToXML(const Graph& graph, const std::string& path) { if (path.empty()) return; std::string binPath; ov::pass::Manager manager; - manager.register_pass(path, - binPath, - ov::pass::Serialize::Version::IR_V10); + manager.register_pass(path, binPath, ov::pass::Serialize::Version::IR_V10); manager.run_passes(graph.dump()); } -void serializeToCout(const Graph &graph) { +void serializeToCout(const Graph& graph) { for (const auto& node : graph.GetNodes()) { std::cout << "name: " << node->getName() << " [ "; auto nodeDesc = node->getSelectedPrimitiveDescriptor(); @@ -251,8 +252,7 @@ void serializeToCout(const Graph &graph) { auto& inConfs = nodeDesc->getConfig().inConfs; if (!inConfs.empty()) { std::cout << "in: " << inConfs.front().getMemDesc()->getPrecision().get_type_name() - << "/l=" << inConfs.front().getMemDesc()->serializeFormat() - << "; "; + << "/l=" << inConfs.front().getMemDesc()->serializeFormat() << "; "; } auto& outConfs = nodeDesc->getConfig().outConfs; if (!outConfs.empty()) { @@ -260,11 +260,11 @@ void serializeToCout(const Graph &graph) { << "/l=" << outConfs.front().getMemDesc()->serializeFormat(); } } - std::cout << " ]" << std::endl; + std::cout << " ]" << std::endl; } } -void summary_perf(const Graph &graph) { +void summary_perf(const Graph& graph) { if (!graph.getGraphContext()) { return; } @@ -277,7 +277,7 @@ void summary_perf(const Graph &graph) { std::map perf_by_node; double total_avg = 0; uint64_t 
total = 0; - for (auto &node : graph.GetNodes()) { // important: graph.graphNodes are in topological order + for (auto& node : graph.GetNodes()) { // important: graph.graphNodes are in topological order double avg = node->PerfCounter().avg(); auto type = node->getTypeStr() + "_" + node->getPrimitiveDescriptorType(); auto name = node->getName(); @@ -296,59 +296,60 @@ void summary_perf(const Graph &graph) { perf_by_node[node] = avg; } - if (total_avg < 1) return; + if (total_avg < 1) + return; std::cout << "======= ENABLE_DEBUG_CAPS:OV_CPU_SUMMARY_PERF ======" << std::endl; - std::cout << "Summary of " << graph.GetName() << " @" << std::hash{}(reinterpret_cast(&graph)) << std::endl; + std::cout << "Summary of " << graph.GetName() << " @" << std::hash{}(reinterpret_cast(&graph)) + << std::endl; std::cout << " Total(us): " << (uint64_t)(total) << std::endl; std::cout << " Total_avg(us): " << (uint64_t)(total_avg) << std::endl; { std::cout << " perf_by_type:" << std::endl; - std::vector > A; + std::vector> A; for (auto& it : perf_by_type) A.push_back(it); - sort(A.begin(), A.end(), - [](std::pair& a, - std::pair& b){ - return a.second > b.second; - }); + sort(A.begin(), A.end(), [](std::pair& a, std::pair& b) { + return a.second > b.second; + }); for (auto& it : A) { std::stringstream ss; - int percentage = static_cast(it.second*100/total_avg); - if (percentage == 0) break; - ss << std::setw(10) << std::right << percentage << " % : " << std::setw(8) << std::right << it.second << "(us) " << it.first << std::endl; + int percentage = static_cast(it.second * 100 / total_avg); + if (percentage == 0) + break; + ss << std::setw(10) << std::right << percentage << " % : " << std::setw(8) << std::right << it.second + << "(us) " << it.first << std::endl; std::cout << ss.str(); } } { std::cout << " perf_by_node:" << std::endl; - std::vector > A; + std::vector> A; for (auto& it : perf_by_node) A.push_back(it); - sort(A.begin(), A.end(), - [](std::pair& a, - std::pair& b){ + 
sort(A.begin(), A.end(), [](std::pair& a, std::pair& b) { return a.second > b.second; }); for (auto& it : A) { std::stringstream ss; - auto percentage = it.second*100/total_avg; + auto percentage = it.second * 100 / total_avg; auto node = it.first; - if (node->PerfCounter().count() == 0) continue; - if (node->PerfCounter().avg() < 1) continue; + if (node->PerfCounter().count() == 0) + continue; + if (node->PerfCounter().avg() < 1) + continue; ss << std::setw(10) << std::right << std::fixed << std::setprecision(2) << percentage << " % " - << std::setw(8) << std::right << node->PerfCounter().avg() << "(us)x" << node->PerfCounter().count() - << " #" << node->getExecIndex() - << " " << node->getName() - << " " << node->getTypeStr() + "_" + node->getPrimitiveDescriptorType() << std::endl; + << std::setw(8) << std::right << node->PerfCounter().avg() << "(us)x" << node->PerfCounter().count() + << " #" << node->getExecIndex() << " " << node->getName() << " " + << node->getTypeStr() + "_" + node->getPrimitiveDescriptorType() << std::endl; std::cout << ss.str(); } } } -void average_counters(const Graph &graph) { +void average_counters(const Graph& graph) { /** * @todo improve logic for a graph with inner graphs: * - collect counters only for the outer graph if full path is specified @@ -359,7 +360,8 @@ void average_counters(const Graph &graph) { static int graphIndex = 0; std::ofstream file; - std::string fileName = graph.getConfig().debugCaps.averageCountersPath + "_" + std::to_string(graphIndex++) + ".csv"; + std::string fileName = + graph.getConfig().debugCaps.averageCountersPath + "_" + std::to_string(graphIndex++) + ".csv"; file.open(fileName); @@ -379,18 +381,14 @@ void average_counters(const Graph &graph) { const auto cpuTime = toMs(avg); const auto realTime = cpuTime; - file << node->getName() << ";" - << status << ";" - << node->getTypeStr() << ";" - << node->getPrimitiveDescriptorType() << ";" - << realTime << ";" - << cpuTime << ";" - << "\n"; + file << 
node->getName() << ";" << status << ";" << node->getTypeStr() << ";" + << node->getPrimitiveDescriptorType() << ";" << realTime << ";" << cpuTime << ";" + << "\n"; return avg; }; - for (auto &node : graph.GetNodes()) { + for (auto& node : graph.GetNodes()) { if (node->isConstant()) continue; @@ -399,11 +397,12 @@ void average_counters(const Graph &graph) { const auto totalMs = toMs(total); - file << "Total;;;;" << totalMs << ";" << totalMs << ";" << "\n"; + file << "Total;;;;" << totalMs << ";" << totalMs << ";" + << "\n"; file.close(); } #endif -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/graph_dumper.h b/src/plugins/intel_cpu/src/graph_dumper.h index 417db7e4c3cdc5..40af2fd44c61e6 100644 --- a/src/plugins/intel_cpu/src/graph_dumper.h +++ b/src/plugins/intel_cpu/src/graph_dumper.h @@ -4,19 +4,19 @@ #pragma once -#include "graph.h" - #include +#include "graph.h" + namespace ov { namespace intel_cpu { -std::shared_ptr dump_graph_as_ie_ngraph_net(const Graph &graph); +std::shared_ptr dump_graph_as_ie_ngraph_net(const Graph& graph); #ifdef CPU_DEBUG_CAPS -void serialize(const Graph &graph); -void summary_perf(const Graph &graph); -void average_counters(const Graph &graph); -#endif // CPU_DEBUG_CAPS +void serialize(const Graph& graph); +void summary_perf(const Graph& graph); +void average_counters(const Graph& graph); +#endif // CPU_DEBUG_CAPS -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/graph_optimizer.cpp b/src/plugins/intel_cpu/src/graph_optimizer.cpp index 61590b8691f4b2..fe0df309dc32f1 100644 --- a/src/plugins/intel_cpu/src/graph_optimizer.cpp +++ b/src/plugins/intel_cpu/src/graph_optimizer.cpp @@ -4,6 +4,7 @@ #include "graph_optimizer.h" +#include "cpu_types.h" #include "dnnl_extension_utils.h" #include "nodes/bin_conv.h" #include "nodes/common/cpu_convert.h" @@ -22,28 +23,26 @@ #include 
"nodes/transpose.h" #include "onednn/dnnl.h" #include "openvino/opsets/opset1.hpp" -#include "cpu_types.h" #include "utils/cpu_utils.hpp" #include "utils/debug_capabilities.h" #include "utils/general_utils.h" // WA for xbyak.h #ifdef _WIN32 -# ifndef _WINSOCKAPI_ -# define _WINSOCKAPI_ -# endif -# ifndef _WINSOCK2API_ -# define _WINSOCK2API_ -#endif +# ifndef _WINSOCKAPI_ +# define _WINSOCKAPI_ +# endif +# ifndef _WINSOCK2API_ +# define _WINSOCK2API_ +# endif #endif -#include "cpu/x64/cpu_isa_traits.hpp" - -#include +#include #include #include #include -#include +#include +#include "cpu/x64/cpu_isa_traits.hpp" #include "itt.h" #include "memory_desc/cpu_memory_desc_utils.h" @@ -55,11 +54,15 @@ namespace intel_cpu { GraphOptimizer::GraphOptimizer() {} -void GraphOptimizer::ApplyCommonGraphOptimizations(Graph &graph) { +void GraphOptimizer::ApplyCommonGraphOptimizations(Graph& graph) { // For conv with input zp, canBeExecutedInInt8() check has dependency on input zero point check. - // Also zero point node is the input of computing-intensive nodes. Most others fusing are the output of computing-intensive nodes. - // So Locate the FuseConvolutionAndZeroPoints() as the first optimization. - OV_ITT_SCOPE_CHAIN(FIRST_INFERENCE, taskChain, itt::domains::intel_cpu_LT, "ApplyCommonGraphOptimizations", "FuseConvolutionAndZeroPoints"); + // Also zero point node is the input of computing-intensive nodes. Most others fusing are the output of + // computing-intensive nodes. So Locate the FuseConvolutionAndZeroPoints() as the first optimization. 
+ OV_ITT_SCOPE_CHAIN(FIRST_INFERENCE, + taskChain, + itt::domains::intel_cpu_LT, + "ApplyCommonGraphOptimizations", + "FuseConvolutionAndZeroPoints"); FuseConvolutionAndZeroPoints(graph); graph.RemoveDroppedNodes(); @@ -67,10 +70,6 @@ void GraphOptimizer::ApplyCommonGraphOptimizations(Graph &graph) { FuseConvMatmulFCDeconvAndDQScales(graph); graph.RemoveDroppedNodes(); - OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseFCAndWeightsDecompression"); - FuseFCAndWeightsDecompression(graph); - graph.RemoveDroppedNodes(); - OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseConvolutionAndBias"); FuseConvolutionMatMulDeconvAndBias(graph); graph.RemoveDroppedNodes(); @@ -191,7 +190,7 @@ void GraphOptimizer::ApplyCommonGraphOptimizations(Graph &graph) { graph.RemoveDroppedEdges(); } -void GraphOptimizer::ApplyImplSpecificGraphOptimizations(Graph &graph) { +void GraphOptimizer::ApplyImplSpecificGraphOptimizations(Graph& graph) { OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "GraphOptimizer::ApplyImplSpecificGraphOptimizations"); DropDoubleReorders(graph); @@ -206,7 +205,7 @@ void GraphOptimizer::ApplyImplSpecificGraphOptimizations(Graph &graph) { graph.RemoveDroppedEdges(); } -void GraphOptimizer::FuseConvMatmulFCDeconvAndDQScales(Graph &graph) { +void GraphOptimizer::FuseConvMatmulFCDeconvAndDQScales(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isDQScaleGraphPattern = [](NodePtr node) { @@ -215,14 +214,12 @@ void GraphOptimizer::FuseConvMatmulFCDeconvAndDQScales(Graph &graph) { } auto parentNode = node->getParentEdgeAt(0)->getParent(); auto scaleNode = node->getParentEdgeAt(1)->getParent(); - if (!(parentNode->getType() == Type::Convolution - || parentNode->getType() == Type::MatMul - || parentNode->getType() == Type::Deconvolution - || parentNode->getType() == Type::FullyConnected)) + if (!(parentNode->getType() == Type::Convolution || parentNode->getType() == Type::MatMul || + parentNode->getType() == Type::Deconvolution)) return false; if 
(!scaleNode->isConstant()) return false; - //Only Fusing scales for INT8 precision. + // Only Fusing scales for INT8 precision. if (!parentNode->canBeExecutedInInt8()) return false; return (parentNode->getParentEdges().size() == 2); @@ -238,8 +235,7 @@ void GraphOptimizer::FuseConvMatmulFCDeconvAndDQScales(Graph &graph) { if (!node->getFusedWith().empty() || !scales->getFusedWith().empty()) return false; - const auto scalesDims = getNormalizedDimsBySize(scales->getOutputShapeAtPort(0).getDims(), - nodeOutDims.size()); + const auto scalesDims = getNormalizedDimsBySize(scales->getOutputShapeAtPort(0).getDims(), nodeOutDims.size()); if (nodeOutDims.size() != scalesDims.size() || scalesDims.size() < 2) return false; @@ -266,7 +262,7 @@ void GraphOptimizer::FuseConvMatmulFCDeconvAndDQScales(Graph &graph) { if (scalesData == nullptr) OPENVINO_THROW("scalesBlob has not allocated buffer"); auto scalesDims = getNormalizedDimsBySize(scales->getOutputShapeAtPort(0).getDims(), - node->getOutputShapeAtPort(0).getDims().size()); + node->getOutputShapeAtPort(0).getDims().size()); auto scaleSize = std::accumulate(scalesDims.begin(), scalesDims.end(), 1, std::multiplies()); node->fuseDQScales(scalesData, scaleSize); return true; @@ -274,16 +270,21 @@ void GraphOptimizer::FuseConvMatmulFCDeconvAndDQScales(Graph &graph) { for (size_t i = 0; i < graphNodes.size(); i++) { auto mul = graphNodes[i]; - if (!isDQScaleGraphPattern(mul)) continue; + if (!isDQScaleGraphPattern(mul)) + continue; CPU_GRAPH_OPTIMIZER_SCOPE(FuseConvMatmulFCDeconvAndDQScales); auto node = mul->getParentEdgeAt(0)->getParent(); auto scales = mul->getParentEdgeAt(1)->getParent(); - if (!scaleDimsCheck(node, scales)) continue; + if (!scaleDimsCheck(node, scales)) + continue; if (initializeDeQuantizedScales(node, scales)) { - DEBUG_LOG("GraphOptimizer##FusingDQ: Node ##", mul->getName(), " optimized as DQ scales of Node ##", node->getName()); + DEBUG_LOG("GraphOptimizer##FusingDQ: Node ##", + mul->getName(), + " 
optimized as DQ scales of Node ##", + node->getName()); node->addOriginalLayer(mul->getOriginalLayers()); auto p_edge = mul->getParentEdgeAt(1); graph.RemoveEdge(p_edge); @@ -292,258 +293,7 @@ void GraphOptimizer::FuseConvMatmulFCDeconvAndDQScales(Graph &graph) { } } -void GraphOptimizer::FuseFCAndWeightsDecompression(Graph &graph) { - std::set supportedWeightsPrecisions{ - ov::element::u8, ov::element::i8, ov::element::nf4, ov::element::u4, ov::element::i4, ov::element::f4e2m1}; - const std::set supportedDataPrecisions{ov::element::f32, ov::element::bf16}; - auto expectedNode = [](NodePtr node, Type expectedType) { - return node->getType() == expectedType && node->getChildEdges().size() == 1; - }; - -#define SKIP_FUSION_FOR_NODE(node) \ - DEBUG_LOG("FuseFCAndWeightsDecompression can't be applied for node ", node->getName()); \ - continue - - if (!impl::cpu::x64::mayiuse(impl::cpu::x64::avx2)) - return; - - auto& graphNodes = graph.GetNodes(); - for (size_t i = 0; i < graphNodes.size(); i++) { - const auto fcNode = std::dynamic_pointer_cast(graphNodes[i]); - if (fcNode == nullptr) - continue; - - auto parent = fcNode->getParentEdgeAt(1)->getParent(); - const bool withTranspose = parent->getType() == Type::Transpose; - const NodePtr transposeNode = withTranspose ? parent : nullptr; - if (transposeNode) - parent = transposeNode->getParentEdgeAt(0)->getParent(); - // Compressed weights can be shared between several FC layers - const bool is_shared_decompression = parent->getChildEdges().size() > 1; - - const bool withReshape = parent->getType() == Type::Reshape; - const auto reshapeNode = withReshape ? 
parent : nullptr; - if (reshapeNode) { - parent = reshapeNode->getParentEdgeAt(0)->getParent(); - } - - const auto multiplyNode = parent; - if (multiplyNode->getType() != Type::Eltwise || multiplyNode->getAlgorithm() != Algorithm::EltwiseMultiply || - !multiplyNode->isConstant()) { - SKIP_FUSION_FOR_NODE(fcNode); - } - - CPU_GRAPH_OPTIMIZER_SCOPE(FuseFCAndWeightsDecompression); - const auto mulParent1 = multiplyNode->getParentEdgeAt(1)->getParent(); - NodePtr multiplyParent, multiplyConvertNode, multiplyConstNode; - multiplyParent = mulParent1; - if (multiplyParent->getType() == Type::Convert) { - multiplyConvertNode = multiplyParent; - multiplyParent = multiplyConvertNode->getParentEdgeAt(0)->getParent(); - } - multiplyConstNode = multiplyParent; - if (multiplyConstNode->getType() != Type::Input) { - SKIP_FUSION_FOR_NODE(fcNode); - } - const bool withMultiplyConvert = multiplyConvertNode != nullptr; - - const auto mulParent0 = multiplyNode->getParentEdgeAt(0)->getParent(); - const bool withSubtract = mulParent0->getAlgorithm() == Algorithm::EltwiseSubtract; - NodePtr subtractNode, subtractConvertNode, subtractConstNode; - if (withSubtract) { - subtractNode = mulParent0; - if (!expectedNode(subtractNode, Type::Eltwise)) { - SKIP_FUSION_FOR_NODE(fcNode); - } - auto subtractParent = subtractNode->getParentEdgeAt(1)->getParent(); - if (subtractParent->getType() == Type::Convert) { - subtractConvertNode = subtractParent; - subtractParent = subtractConvertNode->getParentEdgeAt(0)->getParent(); - } - subtractConstNode = subtractParent; - if (subtractConstNode->getType() != Type::Input) { - SKIP_FUSION_FOR_NODE(fcNode); - } - } - - const bool withSubtractConvert = subtractConvertNode != nullptr; - const auto convertNode = withSubtract ? 
subtractNode->getParentEdgeAt(0)->getParent() : mulParent0; - if (!expectedNode(convertNode, Type::Convert)) { - SKIP_FUSION_FOR_NODE(fcNode); - } - const auto weightsNode = convertNode->getParentEdgeAt(0)->getParent(); - if (weightsNode->getType() != Type::Input) { - SKIP_FUSION_FOR_NODE(fcNode); - } - - // Precision limitations - if (supportedDataPrecisions.find(fcNode->getOriginalInputPrecisionAtPort(0)) == supportedDataPrecisions.end()) { - SKIP_FUSION_FOR_NODE(fcNode); - } - if (supportedWeightsPrecisions.find(weightsNode->getOriginalOutputPrecisionAtPort(0)) == supportedWeightsPrecisions.end()) { - SKIP_FUSION_FOR_NODE(fcNode); - } - if (withSubtract && - !one_of(subtractConstNode->getOriginalOutputPrecisionAtPort(0), weightsNode->getOriginalOutputPrecisionAtPort(0), ov::element::f32)) { - SKIP_FUSION_FOR_NODE(fcNode); - } - - // Shape limitations - const auto weightsShape = weightsNode->getOutputShapeAtPort(0); - if (weightsShape != multiplyNode->getOutputShapeAtPort(0)) { - SKIP_FUSION_FOR_NODE(fcNode); - } - if (reshapeNode && (reshapeNode->getInputShapeAtPort(0).getRank() != 3 || reshapeNode->getOutputShapeAtPort(0).getRank() != 2)) { - SKIP_FUSION_FOR_NODE(fcNode); - } - - VectorDims decompressionConstShape; - const auto fcInputWeightsShape = fcNode->getInputShapeAtPort(1); - int groupNum = 1; - // Ordinary case: one decompression group - if (fcInputWeightsShape.getRank() == weightsShape.getRank()) { - const auto& out_channels = fcInputWeightsShape.getDims()[0]; - decompressionConstShape = withTranspose ? VectorDims{1, out_channels} : VectorDims{out_channels, 1}; - } else { - // Group decompression case: last 3 dimension (there could be also prepending '1's in the beginning) of weights shape must be: - // [N, G, O], if transpose = true - // [O, N, G], otherwise. - // O - output channels - // N - number of groups - // G - group size - const auto& weights_dims = weightsShape.getStaticDims(); - const auto& N = withTranspose ? 
*(weights_dims.rbegin() + 2) : *(weights_dims.rbegin() + 1); - const auto& O = withTranspose ? *weights_dims.rbegin() : *(weights_dims.rbegin() + 2); - // Group decompression is applied by O and N dims - decompressionConstShape = withTranspose ? VectorDims{N, 1, O} : VectorDims{O, N, 1}; - groupNum = N; - } - - auto check_decompression_shape = [&decompressionConstShape](const VectorDims& shape_to_check) { - if (shape_to_check.size() > decompressionConstShape.size()) - return false; - if (std::all_of(shape_to_check.begin(), shape_to_check.end(), [](Dim x) { return x == 1; })) - return true; - const auto comparison_start_pos = decompressionConstShape.size() - shape_to_check.size(); - // in case of different ranks shapes are compared taking into account ranks numpy broadcasting - return std::equal(shape_to_check.begin(), shape_to_check.end(), decompressionConstShape.begin() + comparison_start_pos); - }; - if (!check_decompression_shape(multiplyConstNode->getOutputShapeAtPort(0).getDims())) { - SKIP_FUSION_FOR_NODE(fcNode); - } - if (withSubtract && !check_decompression_shape(subtractConstNode->getOutputShapeAtPort(0).getDims())) { - SKIP_FUSION_FOR_NODE(fcNode); - } - - const size_t OC = fcInputWeightsShape.getDims()[0]; - const size_t IC = fcInputWeightsShape.getDims()[1]; - // HW specific shape limitations - if (impl::cpu::x64::mayiuse(impl::cpu::x64::avx512_core_amx) && - fcNode->getOriginalInputPrecisionAtPort(0) == ov::element::bf16) { - // OneDNN AMX IP implementation has limited shapes support due to performance considerations. As a current solution conditions below are copied - // from OneDNN to make sure correct IP impl will be used since fallback one doesn't support weights decompression feature. 
- size_t simdWidth = 16; - size_t vnniFactor = 2; - size_t maxSize = 512; - auto amxRow = vnniFactor * simdWidth; - - if ((IC <= amxRow && OC <= amxRow) || (IC <= maxSize && OC <= maxSize && IC % amxRow != 0)) { - SKIP_FUSION_FOR_NODE(fcNode); - } - } - - // OneDNN IP primitive provides limited decompression params support - if (IC % groupNum != 0 || IC / groupNum < 4 || OC == 1) { - SKIP_FUSION_FOR_NODE(fcNode); - } - - // Fusion processing - auto *multiplyInputNode = dynamic_cast(multiplyConstNode.get()); - OPENVINO_ASSERT(multiplyInputNode, "Cannot cast ", multiplyConstNode->getName(), " to Input node."); - fcNode->fuseDecompressionMultiply(multiplyInputNode->getMemoryPtr()); - - if (withSubtract) { - auto *subtractInputNode = dynamic_cast(subtractConstNode.get()); - OPENVINO_ASSERT(multiplyInputNode, "Cannot cast ", subtractConstNode->getName(), " to Input node."); - fcNode->fuseDecompressionSubtract(subtractInputNode->getMemoryPtr()); - } - - fcNode->addOriginalLayer(multiplyNode->getOriginalLayers()); - fcNode->addOriginalLayer(convertNode->getOriginalLayers()); - if (withSubtract) - fcNode->addOriginalLayer(subtractNode->getOriginalLayers()); - if (withSubtractConvert) - fcNode->addOriginalLayer(subtractConvertNode->getOriginalLayers()); - if (withMultiplyConvert) - fcNode->addOriginalLayer(multiplyConvertNode->getOriginalLayers()); - - const auto& weightsPrecision = weightsNode->getOriginalOutputPrecisionAtPort(0); - if (withTranspose) { - transposeNode->setOriginalInputPrecisionAtPort(0, weightsPrecision); - transposeNode->setOriginalOutputPrecisionAtPort(0, weightsPrecision); - } - if (withReshape) { - reshapeNode->setOriginalInputPrecisionAtPort(0, weightsPrecision); - reshapeNode->setOriginalOutputPrecisionAtPort(0, weightsPrecision); - } - fcNode->setOriginalInputPrecisionAtPort(1, weightsPrecision); - - // If decompression subgraph is shared with other nodes, it mustn't be removed. 
- // In this case, the current FC is reconnected to the weights - if (is_shared_decompression) { - const auto weights_out_edge = weightsNode->getChildEdges()[0].lock(); - const auto fc_weights_path_edge = withTranspose ? transposeNode->getParentEdgeAt(0) - : fcNode->getParentEdgeAt(1); - const auto inNum = weights_out_edge->getInputNum(); - const auto outNum = fc_weights_path_edge->getOutputNum(); - graph.RemoveEdge(fc_weights_path_edge); - // In case of shared group decompression, Reshape node has to be copied for the current FC - if (withReshape) { - const auto& reshapeOutShape = reshapeNode->getOutputShapeAtPort(0).getStaticDims(); - auto reshapeConst = std::make_shared(ov::element::i32, - ov::Shape{reshapeOutShape.size()}, - reshapeOutShape); - auto reshapeDummyInput = std::make_shared(reshapeNode->getOriginalInputPrecisionAtPort(0), - reshapeNode->getInputShapeAtPort(0).toPartialShape()); - const auto reshape = std::make_shared(reshapeDummyInput, reshapeConst, false); - reshape->set_friendly_name(reshapeNode->getName() + "_copy"); - const auto cpuReshape = std::make_shared(reshape, graph.getGraphContext()); - graph.InsertNode(weightsNode, withTranspose ? transposeNode : fcNode, cpuReshape, inNum, outNum, false); - const auto cpuReshapeConst = std::make_shared(reshapeConst, graph.getGraphContext()); - graph.AddNode(cpuReshapeConst); - graph.CreateEdge(cpuReshapeConst, cpuReshape, 0, 1); - } else { - graph.CreateEdge(weightsNode, withTranspose ? transposeNode : fcNode, inNum, outNum); - } - } else { - // If decompression subgraph is not shared with other nodes, it can be removed - if (withSubtract) - graph.RemoveEdge(subtractNode->getParentEdgeAt(1)); - if (withSubtractConvert) { - // SubtractConvert is removed only if there are no other consumers (e.g. 
CompressedGather) - const auto& restChilds = subtractConvertNode->getChildEdges(); - if (restChilds.empty()) - graph.RemoveEdge(subtractConvertNode->getParentEdgeAt(0)); - } - graph.RemoveEdge(multiplyNode->getParentEdgeAt(1)); - if (withMultiplyConvert) { - // MultiplyConvert is removed only if there are no other consumers (e.g. CompressedGather) - const auto& restChilds = multiplyConvertNode->getChildEdges(); - if (restChilds.empty()) - graph.RemoveEdge(multiplyConvertNode->getParentEdgeAt(0)); - } - - graph.DropNode(convertNode); - if (withSubtract) - graph.DropNode(subtractNode); - graph.DropNode(multiplyNode); - } - DEBUG_LOG("FuseFCAndWeightsDecompression finished for node ", fcNode->getName()); - } -#undef SKIP_FUSION_FOR_NODE -} - -void GraphOptimizer::FuseConvolutionMatMulDeconvAndBias(Graph &graph) { +void GraphOptimizer::FuseConvolutionMatMulDeconvAndBias(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableParentNode = [](const NodePtr& node) { @@ -556,16 +306,14 @@ void GraphOptimizer::FuseConvolutionMatMulDeconvAndBias(Graph &graph) { return false; if (!deconv) - return (one_of(node->getType(), Type::Convolution, Type::MatMul, Type::FullyConnected) && - node->getParentEdges().size() == 2); + return (one_of(node->getType(), Type::Convolution, Type::MatMul) && node->getParentEdges().size() == 2); else return deconv->canFuseBias(); }; auto isSuitableChildNode = [&](const NodePtr& parentNode, const NodePtr& childNode) { - if (childNode->getAlgorithm() != Algorithm::EltwiseAdd - || !childNode->getFusedWith().empty() - || childNode->getParentEdges().size() != 2) + if (childNode->getAlgorithm() != Algorithm::EltwiseAdd || !childNode->getFusedWith().empty() || + childNode->getParentEdges().size() != 2) return false; auto biasPort = childNode->getParentEdgeAt(0)->getParent() == parentNode ? 
1 : 0; @@ -574,10 +322,11 @@ void GraphOptimizer::FuseConvolutionMatMulDeconvAndBias(Graph &graph) { return false; const auto parentOutDims = parentNode->getOutputShapeAtPort(0).getDims(); - const auto biasDims = getNormalizedDimsBySize(biasNode->getOutputShapeAtPort(0).getDims(), - parentOutDims.size()); - // TODO [NM]: Legacy ConvBias fusion transformation supports both per-tensor (via explicit broadcasing) and per-channel cases. - // Most of the real models contain per-channel bias, so we need to reavaluate the need to support per-tensor variant. + const auto biasDims = + getNormalizedDimsBySize(biasNode->getOutputShapeAtPort(0).getDims(), parentOutDims.size()); + // TODO [NM]: Legacy ConvBias fusion transformation supports both per-tensor (via explicit broadcasing) and + // per-channel cases. Most of the real models contain per-channel bias, so we need to reavaluate the need to + // support per-tensor variant. if (parentOutDims.size() != biasDims.size() || biasDims.size() < 2) return false; @@ -613,9 +362,11 @@ void GraphOptimizer::FuseConvolutionMatMulDeconvAndBias(Graph &graph) { for (size_t i = 0; i < parents.size(); i++) { auto p_edge = parents[i].lock(); - if (!p_edge) continue; + if (!p_edge) + continue; auto parent = p_edge->getParent(); - if (!parent) continue; + if (!parent) + continue; if (parent == parentNode) { for (size_t j = 0; j < childs.size(); j++) { @@ -625,7 +376,7 @@ void GraphOptimizer::FuseConvolutionMatMulDeconvAndBias(Graph &graph) { if (!child) continue; - EdgePtr &remEdge = p_edge; + EdgePtr& remEdge = p_edge; int inNum = 0; if (remEdge) { inNum = remEdge->getInputNum(); @@ -640,7 +391,7 @@ void GraphOptimizer::FuseConvolutionMatMulDeconvAndBias(Graph &graph) { graph.CreateEdge(parent, child, inNum, outNum); } } else { - EdgePtr &remEdge = p_edge; + EdgePtr& remEdge = p_edge; int inNum = 0; if (remEdge) { inNum = remEdge->getInputNum(); @@ -654,48 +405,57 @@ void GraphOptimizer::FuseConvolutionMatMulDeconvAndBias(Graph &graph) { // 
ONEDNN Conv, Deconv, FC would need the bias to be flatten into 1D tensor. // Usually the bias output shape would be normalized to align rank with Conv/Deconv/FC output. // To avoid duplicate reshape WA code in nodes, here we flatten the shape. - // Most bias nodes are const Input and bias memory primitive has been initialized as const memory when constructing CPU Input node. - // Const memory is not allowed to be modified after initialized. It means we can't redefine const bias memory primitive. - // So let's insert a reshape node to flatten the bias shape into 1D and const folding node will be executed during the compiling stage. - const bool needReshape = (targetNode->getType() != Type::MatMul && - biasOutputShape.getRank() != 1); + // Most bias nodes are const Input and bias memory primitive has been initialized as const memory when + // constructing CPU Input node. Const memory is not allowed to be modified after initialized. It means + // we can't redefine const bias memory primitive. So let's insert a reshape node to flatten the bias + // shape into 1D and const folding node will be executed during the compiling stage. + const bool needReshape = (targetNode->getType() != Type::MatMul && biasOutputShape.getRank() != 1); if (needReshape) { // Bias -> Reshape -> Conv/Deconv/FC const VectorDims flattenShape = {biasOutputShape.getElementsCount()}; // Construct Ngraph Reshape node and CPU Reshape node. 
- auto reshapeConstInput = std::make_shared(ov::element::i32, ov::Shape{1}, flattenShape); - auto reshapeDummyInput = std::make_shared( - biasNode->getOriginalOutputPrecisionAtPort(0), - biasOutputShape.toPartialShape()); - const auto reshape = std::make_shared(reshapeDummyInput, reshapeConstInput, false); + auto reshapeConstInput = + std::make_shared(ov::element::i32, ov::Shape{1}, flattenShape); + auto reshapeDummyInput = + std::make_shared(biasNode->getOriginalOutputPrecisionAtPort(0), + biasOutputShape.toPartialShape()); + const auto reshape = + std::make_shared(reshapeDummyInput, reshapeConstInput, false); reshape->set_friendly_name(biasNode->getName() + "_flatten_reshape"); - const auto cpuReshapeNode = std::make_shared(reshape, graph.getGraphContext()); + const auto cpuReshapeNode = + std::make_shared(reshape, graph.getGraphContext()); // Insert Reshape between bias node and Conv/Deconv/FC graph.InsertNode(biasNode, targetNode, cpuReshapeNode, inNum, outNum, false); // Insert the Reshape const input node and edge into CPU graph. - const auto cpuReshapeConstInput = std::make_shared(reshapeConstInput, graph.getGraphContext()); + const auto cpuReshapeConstInput = + std::make_shared(reshapeConstInput, graph.getGraphContext()); graph.AddNode(cpuReshapeConstInput); graph.CreateEdge(cpuReshapeConstInput, cpuReshapeNode, 0, 1); - DEBUG_LOG("GraphOptimizer##FusingBias:Flatten Bias node from shape ", PartialShape{biasOutputShape.getDims()}, - " to ", PartialShape{flattenShape}); + DEBUG_LOG("GraphOptimizer##FusingBias:Flatten Bias node from shape ", + PartialShape{biasOutputShape.getDims()}, + " to ", + PartialShape{flattenShape}); // Update bias output shape to be flatten shape. biasOutputShape = Shape{flattenShape}; } else { // Bias is connected as input edge. graph.CreateEdge(biasNode, targetNode, inNum, outNum); } - //Add the Bias inputshape into conv/FC/Deconv/Matmul. + // Add the Bias inputshape into conv/FC/Deconv/Matmul. 
targetNode->inputShapes.push_back(biasOutputShape); } } - DEBUG_LOG("GraphOptimizer##FusingBias:Node ##: ", childNode->getName(), " initialize as Bias of Node ##", parentNode->getName()); + DEBUG_LOG("GraphOptimizer##FusingBias:Node ##: ", + childNode->getName(), + " initialize as Bias of Node ##", + parentNode->getName()); parentNode->addOriginalLayer(childNode->getOriginalLayers()); parentNode->addOriginalInputPrecision(childNode->getOriginalInputPrecisionAtPort(biasPort)); graph.DropNode(childNode); } } -void GraphOptimizer::FuseDeconvolutionAndSimpleOperation(Graph &graph) { +void GraphOptimizer::FuseDeconvolutionAndSimpleOperation(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableParentNode = [](NodePtr node) { @@ -741,7 +501,7 @@ void GraphOptimizer::FuseDeconvolutionAndSimpleOperation(Graph &graph) { childNode->fuseInto(parentNode); auto parentEdges = childNode->parentEdges; - for (auto &parentEdge : parentEdges) { + for (auto& parentEdge : parentEdges) { auto p_edge = parentEdge.lock(); if (p_edge->getParent()->getType() == Type::Deconvolution) continue; @@ -753,7 +513,7 @@ void GraphOptimizer::FuseDeconvolutionAndSimpleOperation(Graph &graph) { } } -void GraphOptimizer::FuseMultiplyAndAdd(Graph &graph) { +void GraphOptimizer::FuseMultiplyAndAdd(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableSecondInput = [](const NodePtr& node, VectorDims dataDims) { @@ -765,9 +525,9 @@ void GraphOptimizer::FuseMultiplyAndAdd(Graph &graph) { auto getChannelAxis = [](const VectorDims& dims) { auto channelAxis = -1; - for (size_t i = 0; i < dims.size(); i ++) { + for (size_t i = 0; i < dims.size(); i++) { if (dims[i] != 1) { - if (channelAxis != -1) // more than one axis is != 1 + if (channelAxis != -1) // more than one axis is != 1 return -1; else channelAxis = i; @@ -795,11 +555,13 @@ void GraphOptimizer::FuseMultiplyAndAdd(Graph &graph) { }; auto isSuitableChildNode = [&](const NodePtr& parentNode, const NodePtr& childNode) { - if 
(childNode->getAlgorithm() != Algorithm::EltwiseAdd || !childNode->getFusedWith().empty() || childNode->getParentEdges().size() != 2) + if (childNode->getAlgorithm() != Algorithm::EltwiseAdd || !childNode->getFusedWith().empty() || + childNode->getParentEdges().size() != 2) return false; - return isSuitableSecondInput(childNode->getParentEdgeAt(1)->getParent(), childNode->getInputShapeAtPort(0).getDims()) && - parentNode->canFuse(childNode); + return isSuitableSecondInput(childNode->getParentEdgeAt(1)->getParent(), + childNode->getInputShapeAtPort(0).getDims()) && + parentNode->canFuse(childNode); }; auto parent = graphNodes.begin(); @@ -825,9 +587,11 @@ void GraphOptimizer::FuseMultiplyAndAdd(Graph &graph) { for (size_t i = 0; i < parents.size(); i++) { auto p_edge = parents[i].lock(); - if (!p_edge) continue; + if (!p_edge) + continue; auto parent = p_edge->getParent(); - if (!parent) continue; + if (!parent) + continue; if (parent == parentNode) { for (size_t j = 0; j < childs.size(); j++) { @@ -837,7 +601,7 @@ void GraphOptimizer::FuseMultiplyAndAdd(Graph &graph) { if (!child) continue; - EdgePtr &remEdge = p_edge; + EdgePtr& remEdge = p_edge; int inNum = 0; if (remEdge) { inNum = remEdge->getInputNum(); @@ -852,7 +616,7 @@ void GraphOptimizer::FuseMultiplyAndAdd(Graph &graph) { graph.CreateEdge(parent, child, inNum, outNum); } } else { - EdgePtr &remEdge = p_edge; + EdgePtr& remEdge = p_edge; int inNum = 0; if (remEdge) { inNum = remEdge->getInputNum(); @@ -908,9 +672,11 @@ void GraphOptimizer::MergeConvertAndScaleShift(Graph& graph) { const auto parents = parentNode->parentEdges; for (size_t i = 0; i < parents.size(); i++) { auto p_edge = parents[i].lock(); - if (!p_edge) continue; + if (!p_edge) + continue; auto parent = p_edge->getParent(); - if (!parent) continue; + if (!parent) + continue; if (!parentNode->childEdges[0].lock()) continue; @@ -944,8 +710,8 @@ void GraphOptimizer::FuseFCAndConvertOnWeights(Graph& graph) { return; #endif - // This 
optimization fuses Convert (fp16 -> bf16/fp32) on weights directly to FC input to allow precision conversion handling based on internal logic - // (e.g. fuse conversion with weights reordering) + // This optimization fuses Convert (fp16 -> bf16/fp32) on weights directly to FC input to allow precision conversion + // handling based on internal logic (e.g. fuse conversion with weights reordering) auto& graphNodes = graph.GetNodes(); for (const auto& fullyConnected : graphNodes) { if (fullyConnected->getType() != Type::FullyConnected) { @@ -978,16 +744,13 @@ void GraphOptimizer::FuseFCAndTransposeOnWeights(Graph& graph) { return; #endif - // This optimization allows us to avoid transposing the weights in Transpose node and do it directly along with reordering in FC node + // This optimization allows us to avoid transposing the weights in Transpose node and do it directly along with + // reordering in FC node auto& graphNodes = graph.GetNodes(); auto isSuitablePattern = [](NodePtr parent) { - bool res = true && parent->getType() == Type::Transpose - && parent->getChildEdges().size() == 1 - && parent->getChildEdgeAt(0)->getOutputNum() == 1 - && parent->getChildEdgeAt(0)->getChild()->getType() == Type::FullyConnected - && parent->getOutputShapeAtPort(0).getRank() == 2 - && parent->isConstant(); + bool res = true && parent->getType() == Type::Transpose && parent->getChildEdges().size() == 1 && + parent->getChildEdgeAt(0)->getChild()->getType() == Type::FullyConnected && parent->isConstant(); return res; }; @@ -1002,7 +765,7 @@ void GraphOptimizer::FuseFCAndTransposeOnWeights(Graph& graph) { } } -void GraphOptimizer::FuseConvolutionAndZeroPoints(Graph &graph) { +void GraphOptimizer::FuseConvolutionAndZeroPoints(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableConvNode = [](NodePtr node) { @@ -1035,9 +798,10 @@ void GraphOptimizer::FuseConvolutionAndZeroPoints(Graph &graph) { return false; // The plug-in doesn't support FP32 convolution with input/weights 
zero points. - // In case weights are in FP32 (or we have zero points on weights which are not supported by INT8 convolution) we cannot use - // INT8 implementation so we have to disable input zero points fusing as well. - if (parent1->getType() != Type::Input || !parent1->isConstant() || parent1->getOriginalOutputPrecisionAtPort(0) != ov::element::i8) { + // In case weights are in FP32 (or we have zero points on weights which are not supported by INT8 convolution) + // we cannot use INT8 implementation so we have to disable input zero points fusing as well. + if (parent1->getType() != Type::Input || !parent1->isConstant() || + parent1->getOriginalOutputPrecisionAtPort(0) != ov::element::i8) { return false; } @@ -1085,7 +849,7 @@ void GraphOptimizer::FuseConvolutionAndZeroPoints(Graph &graph) { if (zeroPointsData == nullptr) OPENVINO_THROW("zeroPointsBlob has not allocated buffer"); - auto zeroPointDataSize = parent0->getInputShapeAtPort(1).getDims()[1]; + auto zeroPointDataSize = parent0->getInputShapeAtPort(1).getDims()[1]; if (Shape::UNDEFINED_DIM == zeroPointDataSize) { return false; } @@ -1121,8 +885,10 @@ void GraphOptimizer::FuseConvolutionAndZeroPoints(Graph &graph) { auto OC = weightsConstantDims[0 + groupOffset]; auto IC = weightsConstantDims[1 + groupOffset]; - auto KD = weightsConstantDims.size() == (5 + groupOffset) ? weightsConstantDims[weightsConstantDims.size() - 3] : 1; - auto KH = weightsConstantDims.size() == (3 + groupOffset) ? 1 : weightsConstantDims[weightsConstantDims.size() - 2]; + auto KD = + weightsConstantDims.size() == (5 + groupOffset) ? weightsConstantDims[weightsConstantDims.size() - 3] : 1; + auto KH = + weightsConstantDims.size() == (3 + groupOffset) ? 
1 : weightsConstantDims[weightsConstantDims.size() - 2]; auto KW = weightsConstantDims[weightsConstantDims.size() - 1]; for (size_t g = 0; g < G; g++) { @@ -1132,20 +898,19 @@ void GraphOptimizer::FuseConvolutionAndZeroPoints(Graph &graph) { for (size_t kd = 0; kd < KD; kd++) { for (size_t kh = 0; kh < KH; kh++) { for (size_t kw = 0; kw < KW; kw++) { - size_t widx = g * OC * IC * KD * KH * KW + - oc * IC * KD * KH * KW + - ic * KD * KH * KW + - kd * KH * KW + - kh * KW + - kw; + size_t widx = g * OC * IC * KD * KH * KW + oc * IC * KD * KH * KW + ic * KD * KH * KW + + kd * KH * KW + kh * KW + kw; auto w = static_cast(weightsPtr[widx]); - auto izp = !convNode->legacyInputZeroPoints.empty() ? static_cast(convNode->legacyInputZeroPoints[g * IC + ic]) : 0; + auto izp = !convNode->legacyInputZeroPoints.empty() + ? static_cast(convNode->legacyInputZeroPoints[g * IC + ic]) + : 0; a += w * izp; - auto wzp = !convNode->legacyWeightsZeroPoints.empty() ? - static_cast(convNode->legacyWeightsZeroPoints[g * OC + oc]) : 0; + auto wzp = !convNode->legacyWeightsZeroPoints.empty() + ? 
static_cast(convNode->legacyWeightsZeroPoints[g * OC + oc]) + : 0; a -= wzp * izp; } } @@ -1158,7 +923,8 @@ void GraphOptimizer::FuseConvolutionAndZeroPoints(Graph &graph) { for (size_t i = 0; i < graphNodes.size(); i++) { auto conv = graphNodes[i]; - if (!isSuitableConvNode(conv)) continue; + if (!isSuitableConvNode(conv)) + continue; CPU_GRAPH_OPTIMIZER_SCOPE(FuseConvolutionAndZeroPoints_ConvNode); @@ -1166,8 +932,10 @@ void GraphOptimizer::FuseConvolutionAndZeroPoints(Graph &graph) { auto weightsEltwise = conv->getParentEdgeAt(1)->getParent(); if (initializeInputZeroPoints(conv, dataEltwise, weightsEltwise)) { auto p_edge = dataEltwise->getParentEdgeAt(1); - DEBUG_LOG("[GraphOptimizer##FusingZeorPoint]:Eltwise Subtract Node ##", dataEltwise->getName(), - " is optimized as zeropoint of Conv ##", conv->getName()); + DEBUG_LOG("[GraphOptimizer##FusingZeorPoint]:Eltwise Subtract Node ##", + dataEltwise->getName(), + " is optimized as zeropoint of Conv ##", + conv->getName()); graph.RemoveEdge(p_edge); graph.DropNode(dataEltwise); initializeOutputCompensation(conv); @@ -1175,7 +943,7 @@ void GraphOptimizer::FuseConvolutionAndZeroPoints(Graph &graph) { } } -void GraphOptimizer::FuseFullyConnectedAndSimpleOperation(Graph &graph) { +void GraphOptimizer::FuseFullyConnectedAndSimpleOperation(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableParentNode = [](NodePtr node) { @@ -1202,7 +970,7 @@ void GraphOptimizer::FuseFullyConnectedAndSimpleOperation(Graph &graph) { if (childNode->getType() == Type::FakeQuantize || childNode->getType() == Type::Eltwise) { auto parentEdges = childNode->parentEdges; - for (auto &parentEdge : parentEdges) { + for (auto& parentEdge : parentEdges) { auto p_edge = parentEdge.lock(); if (p_edge->getParent()->getType() == Type::FullyConnected) continue; @@ -1215,7 +983,7 @@ void GraphOptimizer::FuseFullyConnectedAndSimpleOperation(Graph &graph) { } } -void GraphOptimizer::FuseMatMulAndSimpleOperation(Graph &graph) { +void 
GraphOptimizer::FuseMatMulAndSimpleOperation(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSutableParentNode = [](const NodePtr& node) { @@ -1242,7 +1010,7 @@ void GraphOptimizer::FuseMatMulAndSimpleOperation(Graph &graph) { if (childNode->getType() == Type::FakeQuantize || childNode->getType() == Type::Eltwise) { auto parentEdges = childNode->parentEdges; - for (auto &parentEdge : parentEdges) { + for (auto& parentEdge : parentEdges) { auto p_edge = parentEdge.lock(); if (p_edge->getParent()->getType() == Type::MatMul) continue; @@ -1255,14 +1023,14 @@ void GraphOptimizer::FuseMatMulAndSimpleOperation(Graph &graph) { } } -void GraphOptimizer::FuseConvolutionAndDWConvolution(Graph &graph) { +void GraphOptimizer::FuseConvolutionAndDWConvolution(Graph& graph) { auto& graphNodes = graph.GetNodes(); - auto isConvolutionNode = [](const NodePtr &node) { + auto isConvolutionNode = [](const NodePtr& node) { return node->getType() == Type::Convolution; }; - auto is1x1Convolution = [](const std::shared_ptr &conv) { + auto is1x1Convolution = [](const std::shared_ptr& conv) { const auto weightRank = conv->getWeightDims().size(); return conv->getWeightDims()[weightRank - 1] == 1 && conv->getWeightDims()[weightRank - 2] == 1; }; @@ -1281,10 +1049,10 @@ void GraphOptimizer::FuseConvolutionAndDWConvolution(Graph &graph) { if (!conv->legacyWeightsZeroPoints.empty()) return false; - const auto &strides = conv->getStride(); - const auto &paddings = conv->getPaddingL(); - const auto &inDims = node->getInputShapeAtPort(0).getDims(); - const auto &outDims = node->getOutputShapeAtPort(0).getDims(); + const auto& strides = conv->getStride(); + const auto& paddings = conv->getPaddingL(); + const auto& inDims = node->getInputShapeAtPort(0).getDims(); + const auto& outDims = node->getOutputShapeAtPort(0).getDims(); bool isSupportedParams = conv->getGroupNum() == 1 && inDims.size() == 4 && dimsEqualStrong(inDims[inDims.size() - 1], outDims[outDims.size() - 1]) && @@ -1297,12 
+1065,13 @@ void GraphOptimizer::FuseConvolutionAndDWConvolution(Graph &graph) { static_cast(paddings[paddings.size() - 1]), static_cast(paddings[paddings.size() - 2])) && !conv->canBeExecutedInInt8(); - if (!isSupportedParams) return false; + if (!isSupportedParams) + return false; return node->getChildEdges().size() == 1 && isConvolutionNode(node->getChildEdgeAt(0)->getChild()); }; - auto isSuitableChildConvolution = [&](const NodePtr &parentNode, const NodePtr &childNode) { + auto isSuitableChildConvolution = [&](const NodePtr& parentNode, const NodePtr& childNode) { if (parentNode->isDropped() || childNode->isDropped()) return false; @@ -1317,15 +1086,19 @@ void GraphOptimizer::FuseConvolutionAndDWConvolution(Graph &graph) { if (convParent == nullptr) OPENVINO_THROW("Cannot cast to convolution node ", parentNode->getName()); - if (!everyone_is(ov::element::f32, convParent->getOriginalOutputPrecisionAtPort(0), convChild->getOriginalInputPrecisionAtPort(0), - convChild->getOriginalOutputPrecisionAtPort(0))) + if (!everyone_is(ov::element::f32, + convParent->getOriginalOutputPrecisionAtPort(0), + convChild->getOriginalInputPrecisionAtPort(0), + convChild->getOriginalOutputPrecisionAtPort(0))) return false; - auto parentOutputPrecision = !parentNode->fusedWith.empty() + auto parentOutputPrecision = + !parentNode->fusedWith.empty() ? parentNode->fusedWith[parentNode->fusedWith.size() - 1]->getOriginalOutputPrecisionAtPort(0) : parentNode->getOriginalOutputPrecisionAtPort(0); - auto childOutputPrecision = !childNode->fusedWith.empty() + auto childOutputPrecision = + !childNode->fusedWith.empty() ? 
childNode->fusedWith[childNode->fusedWith.size() - 1]->getOriginalOutputPrecisionAtPort(0) : childNode->getOriginalOutputPrecisionAtPort(0); @@ -1361,7 +1134,7 @@ void GraphOptimizer::FuseConvolutionAndDWConvolution(Graph &graph) { return isSupportedParams; }; - auto isFusingWorthwhile = [&](const NodePtr &parentNode, const NodePtr &childNode) { + auto isFusingWorthwhile = [&](const NodePtr& parentNode, const NodePtr& childNode) { if (!childNode->inputShapes[0].isStatic() || !childNode->outputShapes[0].isStatic()) { return false; } @@ -1372,7 +1145,7 @@ void GraphOptimizer::FuseConvolutionAndDWConvolution(Graph &graph) { int L3_cache_size = dnnl::utils::get_cache_size(3, false); int dw_conv_input_size = inDims[0] * inDims[1] * inDims[2] * inDims[3] * elemSize; - int dw_conv_output_size = outDims[0] * outDims[1]* outDims[2] * outDims[3] * elemSize; + int dw_conv_output_size = outDims[0] * outDims[1] * outDims[2] * outDims[3] * elemSize; auto parentConvolutionNode = std::dynamic_pointer_cast(parentNode); if (parentConvolutionNode == nullptr) @@ -1385,19 +1158,23 @@ void GraphOptimizer::FuseConvolutionAndDWConvolution(Graph &graph) { }; for (size_t i = 0; i < graphNodes.size(); i++) { - if (!isConvolutionNode(graphNodes[i])) continue; + if (!isConvolutionNode(graphNodes[i])) + continue; auto parentConvNode = graphNodes[i]; - if (!isSuitableParentConvolution(parentConvNode)) continue; + if (!isSuitableParentConvolution(parentConvNode)) + continue; CPU_GRAPH_OPTIMIZER_SCOPE(FuseConvolutionAndDWConvolution_ParentConv); auto childConvNode = parentConvNode->getChildEdgeAt(0)->getChild(); - if (!isSuitableChildConvolution(parentConvNode, childConvNode)) continue; + if (!isSuitableChildConvolution(parentConvNode, childConvNode)) + continue; CPU_GRAPH_OPTIMIZER_SCOPE(FuseConvolutionAndDWConvolution_ChildConv); - if (!isFusingWorthwhile(parentConvNode, childConvNode)) continue; + if (!isFusingWorthwhile(parentConvNode, childConvNode)) + continue; 
parentConvNode->addFusedNode(childConvNode); @@ -1411,12 +1188,12 @@ void GraphOptimizer::FuseConvolutionAndDWConvolution(Graph &graph) { } // TODO [NM]: unite with FuseConvolutionAndSimpleOperation -void GraphOptimizer::FuseConvolutionAndSimpleOperationThroughMaxPool(Graph &graph) { +void GraphOptimizer::FuseConvolutionAndSimpleOperationThroughMaxPool(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableParentNode = [](NodePtr node) { - return (node->getType() == Type::Convolution || node->getType() == Type::BinaryConvolution) && node->getChildEdges().size() == 1 && - node->getOriginalOutputPrecisionAtPort(0) == ov::element::f32; + return (node->getType() == Type::Convolution || node->getType() == Type::BinaryConvolution) && + node->getChildEdges().size() == 1 && node->getOriginalOutputPrecisionAtPort(0) == ov::element::f32; }; auto parent = graphNodes.begin(); @@ -1455,7 +1232,7 @@ void GraphOptimizer::FuseConvolutionAndSimpleOperationThroughMaxPool(Graph &grap parentNode->addFusedNode(fuseCandidate); parentNode->addOriginalLayer(fuseCandidate->getOriginalLayers()); auto parentEdges = fuseCandidate->parentEdges; - for (auto &parentEdge : parentEdges) { + for (auto& parentEdge : parentEdges) { auto p_edge = parentEdge.lock(); if (p_edge->getParent() == childNode) continue; @@ -1466,11 +1243,12 @@ void GraphOptimizer::FuseConvolutionAndSimpleOperationThroughMaxPool(Graph &grap } } -void GraphOptimizer::FuseConvolutionAndSimpleOperation(Graph &graph) { +void GraphOptimizer::FuseConvolutionAndSimpleOperation(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableParentNode = [](NodePtr node) { - return (node->getType() == Type::Convolution || node->getType() == Type::BinaryConvolution) && node->getChildEdges().size() == 1; + return (node->getType() == Type::Convolution || node->getType() == Type::BinaryConvolution) && + node->getChildEdges().size() == 1; }; auto parent = graphNodes.begin(); @@ -1495,7 +1273,7 @@ void 
GraphOptimizer::FuseConvolutionAndSimpleOperation(Graph &graph) { if (childNode->getType() == Type::FakeQuantize || childNode->getType() == Type::Eltwise) { auto parentEdges = childNode->parentEdges; - for (auto &parentEdge : parentEdges) { + for (auto& parentEdge : parentEdges) { auto p_edge = parentEdge.lock(); if (p_edge->getParent()->getType() == parentNodeType) continue; @@ -1508,7 +1286,7 @@ void GraphOptimizer::FuseConvolutionAndSimpleOperation(Graph &graph) { } } -void GraphOptimizer::FusePoolingAndFakeQuantize(Graph &graph) { +void GraphOptimizer::FusePoolingAndFakeQuantize(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableParentNode = [](NodePtr node) { @@ -1526,12 +1304,14 @@ void GraphOptimizer::FusePoolingAndFakeQuantize(Graph &graph) { for (size_t i = 0; i < graphNodes.size(); i++) { auto parent = graphNodes[i]; - if (!isSuitableParentNode(parent)) continue; + if (!isSuitableParentNode(parent)) + continue; CPU_GRAPH_OPTIMIZER_SCOPE(FusePoolingAndFakeQuantize_ParentNode); auto child = parent->getChildEdgeAt(0)->getChild(); - if (!isSuitableChildNode(child)) continue; + if (!isSuitableChildNode(child)) + continue; CPU_GRAPH_OPTIMIZER_SCOPE(FusePoolingAndFakeQuantize_ChildNode); @@ -1558,14 +1338,14 @@ void GraphOptimizer::FusePoolingAndFakeQuantize(Graph &graph) { * @param child node we try to find * @return True if child is one of data supplier */ -static bool is_data_dependency(const std::shared_ptr &parent, - const std::shared_ptr &child) { +static bool is_data_dependency(const std::shared_ptr& parent, const std::shared_ptr& child) { std::set visited; - std::list nextLayers {parent.get()}; + std::list nextLayers{parent.get()}; for (; !nextLayers.empty();) { auto layer = *nextLayers.begin(); - if (layer == child.get()) return true; + if (layer == child.get()) + return true; for (auto& oe : layer->getChildEdges()) { auto nn = oe.lock()->getChild(); if (visited.find(nn.get()) == visited.end()) { @@ -1616,19 +1396,18 @@ static bool 
is_data_dependency(const std::shared_ptr &parent, * *** */ -void GraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(Graph &graph) { +void GraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(Graph& graph) { #if !defined(OPENVINO_ARCH_X86) && !defined(OPENVINO_ARCH_X86_64) return; #endif - auto &graphNodes = graph.GetNodes(); + auto& graphNodes = graph.GetNodes(); auto isFusingSupported = [&](NodePtr conv, NodePtr child) { - return child->getType() == Type::Eltwise && - DnnlExtensionUtils::isUnarySupportedAsPostOp(child->getAlgorithm()); + return child->getType() == Type::Eltwise && DnnlExtensionUtils::isUnarySupportedAsPostOp(child->getAlgorithm()); }; - for (auto &graphNode : graphNodes) { + for (auto& graphNode : graphNodes) { const auto eltwiseNode = std::dynamic_pointer_cast(graphNode); if (graphNode->getType() != Type::Eltwise || graphNode->getAlgorithm() != Algorithm::EltwiseAdd || !eltwiseNode || eltwiseNode->isWithBroadcast()) @@ -1642,12 +1421,12 @@ void GraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(Graph &graph) auto parent1 = graphNode->getParentEdgeAt(0)->getParent(); auto parent2 = graphNode->getParentEdgeAt(1)->getParent(); - bool isSuitableParent1 = parent1->getType() == Type::Convolution - || parent1->getType() == Type::BinaryConvolution; - bool isSuitableParent2 = parent2->getType() == Type::Convolution - || parent2->getType() == Type::BinaryConvolution; + bool isSuitableParent1 = + parent1->getType() == Type::Convolution || parent1->getType() == Type::BinaryConvolution; + bool isSuitableParent2 = + parent2->getType() == Type::Convolution || parent2->getType() == Type::BinaryConvolution; - auto canFuseSum = [](node::BinaryConvolution *binConv, NodePtr fuseCandidate) { + auto canFuseSum = [](node::BinaryConvolution* binConv, NodePtr fuseCandidate) { if (binConv->getImplType() == impl_desc_type::ref) return false; @@ -1666,12 +1445,12 @@ void GraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(Graph 
&graph) return false; }; - auto* binConvNode1 = dynamic_cast(parent1.get()); + auto* binConvNode1 = dynamic_cast(parent1.get()); if (binConvNode1) { isSuitableParent1 = isSuitableParent1 && canFuseSum(binConvNode1, graphNode); } - auto* binConvNode2 = dynamic_cast(parent2.get()); + auto* binConvNode2 = dynamic_cast(parent2.get()); if (binConvNode2) { isSuitableParent2 = isSuitableParent2 && canFuseSum(binConvNode2, graphNode); } @@ -1685,7 +1464,7 @@ void GraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(Graph &graph) return false; }; - auto* convNode1 = dynamic_cast(parent1.get()); + auto* convNode1 = dynamic_cast(parent1.get()); if (convNode1) { if (!convNode1->canBeExecutedInInt8()) { isSuitableParent1 = isSuitableParent1 && convNode1->getFusedWith().empty(); @@ -1694,7 +1473,7 @@ void GraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(Graph &graph) } } - auto* convNode2 = dynamic_cast(parent2.get()); + auto* convNode2 = dynamic_cast(parent2.get()); if (convNode2) { if (!convNode2->canBeExecutedInInt8()) { isSuitableParent2 = isSuitableParent2 && convNode2->getFusedWith().empty(); @@ -1713,9 +1492,9 @@ void GraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(Graph &graph) // not merged operation (peerNode) has to be in low precision const auto isBranchQuantized = [](const NodePtr& branchParent) { const auto& fused = branchParent->getFusedWith(); - const auto branchPrecision = fused.empty() ? - branchParent->getOriginalOutputPrecisionAtPort(0) : - fused[fused.size() - 1]->getOriginalOutputPrecisionAtPort(0); + const auto branchPrecision = fused.empty() + ? branchParent->getOriginalOutputPrecisionAtPort(0) + : fused[fused.size() - 1]->getOriginalOutputPrecisionAtPort(0); return (branchPrecision == ov::element::i8) || (branchPrecision == ov::element::u8); }; @@ -1785,15 +1564,16 @@ void GraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(Graph &graph) // be overwritten. 
Should verify that all other consumer already read it and // we can spoil input data. // TODO: rewrite once we add "Inplace" reporting mechanism - for (auto & edge : peerNode->getChildEdges()) { + for (auto& edge : peerNode->getChildEdges()) { if (!fuse_allowed) break; fuse_allowed &= is_data_dependency(edge.lock()->getChild(), sum); } - if (!fuse_allowed) continue; + if (!fuse_allowed) + continue; if (graphNode->getChildEdges().size() == 1 && - isFusingSupported(graphNode, graphNode->getChildEdgeAt(0)->getChild())) { + isFusingSupported(graphNode, graphNode->getChildEdgeAt(0)->getChild())) { auto relu_shared = graphNode->getChildEdgeAt(0)->getChild(); lastNode = relu_shared; if (mergedConv->isConstant() && !lastNode->isConstant()) @@ -1803,8 +1583,8 @@ void GraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(Graph &graph) lastNode->fuseInto(mergedConv); - if (mergedConv->fusedWith.size() > 0 && - (mergedConv->fusedWith[0]->getType() == Type::Convolution || mergedConv->fusedWith[0]->getType() == Type::BinaryConvolution)) { + if (mergedConv->fusedWith.size() > 0 && (mergedConv->fusedWith[0]->getType() == Type::Convolution || + mergedConv->fusedWith[0]->getType() == Type::BinaryConvolution)) { // Merged with DW_conv. 
Shape may change mergedConv->inputShapes.push_back(mergedConv->fusedWith[0]->getOutputShapeAtPort(0)); } else { @@ -1835,7 +1615,7 @@ void GraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(Graph &graph) graph.CreateEdge(peerNode, mergedConv, peer_port, childPort); std::vector edges_to_reconnect = lastNode->getChildEdges(); - for (auto &edge_w : edges_to_reconnect) { + for (auto& edge_w : edges_to_reconnect) { auto edge = edge_w.lock(); auto child = edge->getChild(); int idxParent = edge->getInputNum(); @@ -1855,7 +1635,7 @@ void GraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(Graph &graph) } } -void GraphOptimizer::FuseMVNAndSimpleOperation(Graph &graph) { +void GraphOptimizer::FuseMVNAndSimpleOperation(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableParentNode = [](NodePtr node) { @@ -1882,7 +1662,7 @@ void GraphOptimizer::FuseMVNAndSimpleOperation(Graph &graph) { if (childNode->getType() == Type::FakeQuantize || childNode->getType() == Type::Eltwise) { auto parentEdges = childNode->parentEdges; - for (auto &parentEdge : parentEdges) { + for (auto& parentEdge : parentEdges) { auto p_edge = parentEdge.lock(); if (p_edge->getParent()->getType() == Type::MVN) continue; @@ -1895,7 +1675,7 @@ void GraphOptimizer::FuseMVNAndSimpleOperation(Graph &graph) { } } -void GraphOptimizer::FuseInterpolateAndSimpleOperation(Graph &graph) { +void GraphOptimizer::FuseInterpolateAndSimpleOperation(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableParentNode = [](NodePtr node) { @@ -1904,8 +1684,8 @@ void GraphOptimizer::FuseInterpolateAndSimpleOperation(Graph &graph) { auto isSuitableChildNode = [&](NodePtr parentNode, NodePtr childNode) { // Avoid cycle dependencies - for (auto &childParentEdge : childNode->getParentEdges()) { - for (auto &parentParentEdge : parentNode->getParentEdges()) { + for (auto& childParentEdge : childNode->getParentEdges()) { + for (auto& parentParentEdge : parentNode->getParentEdges()) { if 
(childParentEdge.lock()->getParent() == parentParentEdge.lock()->getParent()) return false; } @@ -1941,7 +1721,7 @@ void GraphOptimizer::FuseInterpolateAndSimpleOperation(Graph &graph) { if (childNode->getType() == Type::FakeQuantize || childNode->getType() == Type::Eltwise) { auto parentEdges = childNode->parentEdges; - for (auto &parentEdge : parentEdges) { + for (auto& parentEdge : parentEdges) { auto p_edge = parentEdge.lock(); if (p_edge->getParent()->getType() == Type::Interpolate) continue; @@ -1954,7 +1734,7 @@ void GraphOptimizer::FuseInterpolateAndSimpleOperation(Graph &graph) { } } -void GraphOptimizer::FuseNormalizeL2AndSimpleOperation(Graph &graph) { +void GraphOptimizer::FuseNormalizeL2AndSimpleOperation(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableParentNode = [](NodePtr node) { @@ -1981,7 +1761,7 @@ void GraphOptimizer::FuseNormalizeL2AndSimpleOperation(Graph &graph) { if (childNode->getType() == Type::FakeQuantize || childNode->getType() == Type::Eltwise) { auto parentEdges = childNode->parentEdges; - for (auto &parentEdge : parentEdges) { + for (auto& parentEdge : parentEdges) { auto p_edge = parentEdge.lock(); if (p_edge->getParent()->getType() == Type::NormalizeL2) continue; @@ -1994,7 +1774,7 @@ void GraphOptimizer::FuseNormalizeL2AndSimpleOperation(Graph &graph) { } } -void GraphOptimizer::FuseReduceAndSimpleOperation(Graph &graph) { +void GraphOptimizer::FuseReduceAndSimpleOperation(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableParentNode = [](NodePtr node) { @@ -2021,7 +1801,7 @@ void GraphOptimizer::FuseReduceAndSimpleOperation(Graph &graph) { if (childNode->getType() == Type::FakeQuantize || childNode->getType() == Type::Eltwise) { auto parentEdges = childNode->parentEdges; - for (auto &parentEdge : parentEdges) { + for (auto& parentEdge : parentEdges) { auto p_edge = parentEdge.lock(); if (p_edge == nullptr) OPENVINO_THROW("Cannot get parent edge ", childNode->getName()); @@ -2036,7 +1816,7 @@ 
void GraphOptimizer::FuseReduceAndSimpleOperation(Graph &graph) { } } -void GraphOptimizer::FuseEltwiseAndSimple(Graph &graph) { +void GraphOptimizer::FuseEltwiseAndSimple(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableParentNode = [](NodePtr node) { @@ -2046,14 +1826,14 @@ void GraphOptimizer::FuseEltwiseAndSimple(Graph &graph) { auto isSuitableChildNode = [&](NodePtr parentNode, NodePtr childNode) { if (parentNode->isConstant() && !childNode->isConstant()) return false; - for (auto &childParentEdge : childNode->getParentEdges()) { + for (auto& childParentEdge : childNode->getParentEdges()) { // WA to prevent unsupported reorder exception issue in some cases if (childParentEdge.lock()->getParent()->getType() == Type::Split) { return false; } // Avoid cycle dependencies - for (auto &parentParentEdge : parentNode->getParentEdges()) { + for (auto& parentParentEdge : parentNode->getParentEdges()) { if (childParentEdge.lock()->getParent() == parentParentEdge.lock()->getParent()) return false; } @@ -2077,7 +1857,8 @@ void GraphOptimizer::FuseEltwiseAndSimple(Graph &graph) { auto childNode = parentNode->getChildEdgeAt(0)->getChild(); - if ((parentNode->isDynamicNode() && !childNode->isDynamicNode()) || (!parentNode->isDynamicNode() && childNode->isDynamicNode())) { + if ((parentNode->isDynamicNode() && !childNode->isDynamicNode()) || + (!parentNode->isDynamicNode() && childNode->isDynamicNode())) { parent++; continue; } @@ -2093,7 +1874,7 @@ void GraphOptimizer::FuseEltwiseAndSimple(Graph &graph) { if (childNode->getType() == Type::FakeQuantize) { auto parentEdges = childNode->parentEdges; - for (auto &parentEdge : parentEdges) { + for (auto& parentEdge : parentEdges) { auto p_edge = parentEdge.lock(); if (p_edge->getParent()->getType() == Type::Eltwise) continue; @@ -2109,9 +1890,11 @@ void GraphOptimizer::FuseEltwiseAndSimple(Graph &graph) { for (size_t i = 0; i < parents.size(); i++) { auto p_edge = parents[i].lock(); - if (!p_edge) continue; + if 
(!p_edge) + continue; auto parent = p_edge->getParent(); - if (!parent) continue; + if (!parent) + continue; if (parent == parentNode) { for (size_t j = 0; j < children.size(); j++) { @@ -2121,7 +1904,7 @@ void GraphOptimizer::FuseEltwiseAndSimple(Graph &graph) { if (!child) continue; - EdgePtr &remEdge = p_edge; + EdgePtr& remEdge = p_edge; int inNum = 0; if (remEdge) { inNum = remEdge->getInputNum(); @@ -2137,7 +1920,7 @@ void GraphOptimizer::FuseEltwiseAndSimple(Graph &graph) { graph.CreateEdge(parent, child, inNum, outNum); } } else { - EdgePtr &remEdge = p_edge; + EdgePtr& remEdge = p_edge; int inNum = 0; int outNum = parentNode->getParentEdges().size(); if (remEdge) { @@ -2228,15 +2011,14 @@ void GraphOptimizer::ShareReorders(Graph& graph) { } } -void GraphOptimizer::DropDoubleReorders(Graph &graph) { +void GraphOptimizer::DropDoubleReorders(Graph& graph) { std::set processed; auto& nodes = graph.GetNodes(); for (size_t i = 0; i < nodes.size(); i++) { auto node = nodes[i]; - if (processed.find(node) == processed.end() && node->getType() == Type::Reorder - && node->getChildEdges().size() == 1 - && node->getChildEdgeAt(0)->getChild()->getType() == Type::Reorder ) { + if (processed.find(node) == processed.end() && node->getType() == Type::Reorder && + node->getChildEdges().size() == 1 && node->getChildEdgeAt(0)->getChild()->getType() == Type::Reorder) { auto nextNode = node->getChildEdgeAt(0)->getChild(); Reorder* n = dynamic_cast(node.get()); if (n == nullptr) @@ -2261,7 +2043,8 @@ void GraphOptimizer::DropDoubleReorders(Graph &graph) { if (cur->getChild() == c) edge = cur; } - if (!edge) OPENVINO_THROW("Inappropriate graph processing"); + if (!edge) + OPENVINO_THROW("Inappropriate graph processing"); std::string layerName = edge->getParent()->getName() + "_ScaleReorder_" + edge->getChild()->getName(); graph.InsertReorder(edge, layerName, n->getInput(), nn->getOutput(), false); @@ -2270,11 +2053,12 @@ void GraphOptimizer::DropDoubleReorders(Graph &graph) { } } 
-void GraphOptimizer::FuseClampAndFakeQuantize(Graph &graph) { +void GraphOptimizer::FuseClampAndFakeQuantize(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableClampNode = [](NodePtr node) { - return node->getType() == Type::Eltwise && node->getChildEdges().size() == 1 && node->getAlgorithm() == Algorithm::EltwiseClamp; + return node->getType() == Type::Eltwise && node->getChildEdges().size() == 1 && + node->getAlgorithm() == Algorithm::EltwiseClamp; }; auto isSuitableFakeQuantizeNode = [](NodePtr node) { @@ -2282,7 +2066,7 @@ void GraphOptimizer::FuseClampAndFakeQuantize(Graph &graph) { }; auto fuseClampAndFakeQuantizeNodes = [](NodePtr parent, NodePtr child) { - auto* eltwiseNode = dynamic_cast(parent.get()); + auto* eltwiseNode = dynamic_cast(parent.get()); if (eltwiseNode == nullptr) OPENVINO_THROW("Cannot cast ", parent->getName(), " to Eltwise node"); @@ -2308,12 +2092,14 @@ void GraphOptimizer::FuseClampAndFakeQuantize(Graph &graph) { for (size_t i = 0; i < graphNodes.size(); i++) { auto parent = graphNodes[i]; - if (!isSuitableClampNode(parent)) continue; + if (!isSuitableClampNode(parent)) + continue; CPU_GRAPH_OPTIMIZER_SCOPE(FuseClampAndFakeQuantize_ClalmpNode); auto child = parent->getChildEdgeAt(0)->getChild(); - if (!isSuitableFakeQuantizeNode(child)) continue; + if (!isSuitableFakeQuantizeNode(child)) + continue; CPU_GRAPH_OPTIMIZER_SCOPE(FuseClampAndFakeQuantize_QuantizeNode); @@ -2323,7 +2109,7 @@ void GraphOptimizer::FuseClampAndFakeQuantize(Graph &graph) { } } -void GraphOptimizer::FusePerformedAsScaleShiftAndFakeQuantize(Graph &graph) { +void GraphOptimizer::FusePerformedAsScaleShiftAndFakeQuantize(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto getNonConstPort = [](const NodePtr& node) { @@ -2341,11 +2127,12 @@ void GraphOptimizer::FusePerformedAsScaleShiftAndFakeQuantize(Graph &graph) { }; auto isSuitableScaleShiftNode = [getNonConstPort](const NodePtr& node) { - if (!one_of(node->getAlgorithm(), 
Algorithm::EltwiseAdd, - Algorithm::EltwiseSubtract, - Algorithm::EltwiseMultiply, - Algorithm::EltwiseDivide, - Algorithm::EltwiseMulAdd)) + if (!one_of(node->getAlgorithm(), + Algorithm::EltwiseAdd, + Algorithm::EltwiseSubtract, + Algorithm::EltwiseMultiply, + Algorithm::EltwiseDivide, + Algorithm::EltwiseMulAdd)) return false; const auto nonConstPort = getNonConstPort(node); @@ -2375,7 +2162,7 @@ void GraphOptimizer::FusePerformedAsScaleShiftAndFakeQuantize(Graph &graph) { const NodePtr eltwiseInput = parentEltwise->getParentEdgeAt(getNonConstPort(parent))->getParent(); std::tie(scalesBuffer, shiftsBuffer) = parentEltwise->getScalesAndShifts(eltwiseInput.get()); - const auto &outputShape = child->getOutputShapeAtPort(0); + const auto& outputShape = child->getOutputShapeAtPort(0); VectorDims outputDims = outputShape.getDims(); // We need to compute explicitly port with unfolded parent, @@ -2436,7 +2223,7 @@ void GraphOptimizer::FusePerformedAsScaleShiftAndFakeQuantize(Graph &graph) { std::vector zeroShift(newInputScale.size(), 0.f); const auto isSubnormal = [](const float value) { - const uint32_t *u32data = reinterpret_cast(&value); + const uint32_t* u32data = reinterpret_cast(&value); return (*u32data) && (((*u32data) & (0xFF << 23)) == 0); }; @@ -2478,18 +2265,20 @@ void GraphOptimizer::FusePerformedAsScaleShiftAndFakeQuantize(Graph &graph) { for (size_t i = 0; i < graphNodes.size(); i++) { auto parent = graphNodes[i]; - if (!isSuitableScaleShiftNode(parent)) continue; + if (!isSuitableScaleShiftNode(parent)) + continue; CPU_GRAPH_OPTIMIZER_SCOPE(FusePerformedAsScaleShiftAndFakeQuantize_ShiftNode); auto child = parent->getChildEdgeAt(0)->getChild(); - if (!isSuitableFakeQuantizeNode(child)) continue; + if (!isSuitableFakeQuantizeNode(child)) + continue; CPU_GRAPH_OPTIMIZER_SCOPE(FusePerformedAsScaleShiftAndFakeQuantize_QuantizeNode); if (fuseScaleShiftAndFakeQuantizeNodes(parent, child)) { auto parentEdges = parent->parentEdges; - for (auto &parentEdge : 
parentEdges) { + for (auto& parentEdge : parentEdges) { auto p_edge = parentEdge.lock(); if (!p_edge->getParent()->isConstant()) continue; @@ -2613,7 +2402,12 @@ void GraphOptimizer::mergeTransposeReshapeReorder(Graph& graph, transposeNode->getName(), " is not a transpose node"); - const auto& inOrder = transposeNode->getSelectedPrimitiveDescriptor()->getConfig().inConfs[0].getMemDesc()->as()->getOrder(); + const auto& inOrder = transposeNode->getSelectedPrimitiveDescriptor() + ->getConfig() + .inConfs[0] + .getMemDesc() + ->as() + ->getOrder(); const auto& outOrder = reorderOutDesc->as()->getOrder(); // Permutation must be set and reorder mustn't be optimized in 2 cases: // 1. Transpose has blocked input & non-blocked output @@ -2629,11 +2423,13 @@ void GraphOptimizer::mergeTransposeReshapeReorder(Graph& graph, } } - std::string reorderName = nodeBeforeSequence->getName() + "_" + Reorder::getReorderArgs(*reorderInDesc, *reorderOutDesc); + std::string reorderName = + nodeBeforeSequence->getName() + "_" + Reorder::getReorderArgs(*reorderInDesc, *reorderOutDesc); if (isOptimized) - reorderName += "_fake"; + reorderName += "_fake"; DEBUG_LOG("mergeTransposeAndReorder ", parentNode->getName(), " and ", childNode->getName(), " -> ", reorderName); - auto reorder_layout = std::make_shared(*reorderInDesc, *reorderOutDesc, reorderName, graph.getGraphContext()); + auto reorder_layout = + std::make_shared(*reorderInDesc, *reorderOutDesc, reorderName, graph.getGraphContext()); reorder_layout->setOptimized(isOptimized); reorder_layout->setSrcPermutation(srcPerm); @@ -2646,10 +2442,8 @@ void GraphOptimizer::mergeTransposeReshapeReorder(Graph& graph, Reorder::getReorderArgs(*reorderOutDesc, *finalDesc) + "_" + nodeAfterSequence->getName(); - reorder_last = std::make_shared(*reorderOutDesc, - *finalDesc, - reorderLayerName2, - graph.getGraphContext()); + reorder_last = + std::make_shared(*reorderOutDesc, *finalDesc, reorderLayerName2, graph.getGraphContext()); 
reorder_last->setOptimized(false); reorder_last->setSrcPermutation(srcPerm); graph.CreateEdge(reorder_layout, reorder_last, 0, 0); @@ -2703,10 +2497,10 @@ void GraphOptimizer::MergeTransposeAndReorder(Graph& graph) { return false; }; - return node->getType() == Type::Transpose - && node->getChildEdges().size() == 1 - && !node->isDynamicNode() // TODO [DS]: enable for dynamic shapes when inPlace in the dynamic case is available (CVS-74863) - && !prevNodeIsConvSum(node); + return node->getType() == Type::Transpose && node->getChildEdges().size() == 1 && + !node->isDynamicNode() // TODO [DS]: enable for dynamic shapes when inPlace in the dynamic case is + // available (CVS-74863) + && !prevNodeIsConvSum(node); }; auto isSuitableReshape = [](NodePtr node) { @@ -2731,8 +2525,9 @@ void GraphOptimizer::MergeTransposeAndReorder(Graph& graph) { }; auto isSuitableReorder = [](NodePtr node) { - return node->getType() == Type::Reorder - && !node->isDynamicNode(); // TODO [DS]: enable for dynamic shapes when inPlace in the dynamic case is available (CVS-74863) + return node->getType() == Type::Reorder && + !node->isDynamicNode(); // TODO [DS]: enable for dynamic shapes when inPlace in the dynamic case is + // available (CVS-74863) }; auto updateOrder = [](const VectorDims& originalOrder, NodePtr reshape) { @@ -2800,17 +2595,28 @@ void GraphOptimizer::MergeTransposeAndReorder(Graph& graph) { const auto transposeNode = std::dynamic_pointer_cast(parentNode); const auto reorderNode = std::dynamic_pointer_cast(childNode); - std::shared_ptr reshapeNode = intermNode != nullptr ? std::dynamic_pointer_cast(intermNode) : nullptr; + std::shared_ptr reshapeNode = + intermNode != nullptr ? 
std::dynamic_pointer_cast(intermNode) : nullptr; if (!transposeNode || !reorderNode || (intermNode && !reshapeNode)) { continue; } auto transposeOrder = updateOrder(transposeNode->getOrder(), reshapeNode); - auto descBeforeReorder = reorderNode->getParentEdgeAt(0)->getParent()->getSelectedPrimitiveDescriptor()->getConfig().outConfs[0].getMemDesc(); + auto descBeforeReorder = reorderNode->getParentEdgeAt(0) + ->getParent() + ->getSelectedPrimitiveDescriptor() + ->getConfig() + .outConfs[0] + .getMemDesc(); auto layoutOrder = descBeforeReorder->as()->getOrder(); - auto inBlockedDesc = reorderNode->getSelectedPrimitiveDescriptor()->getConfig().inConfs[0].getMemDesc()->as(); - auto outBlockedDesc = reorderNode->getSelectedPrimitiveDescriptor()->getConfig().outConfs[0].getMemDesc()->as(); + auto inBlockedDesc = + reorderNode->getSelectedPrimitiveDescriptor()->getConfig().inConfs[0].getMemDesc()->as(); + auto outBlockedDesc = reorderNode->getSelectedPrimitiveDescriptor() + ->getConfig() + .outConfs[0] + .getMemDesc() + ->as(); auto& inOrder = inBlockedDesc->getOrder(); auto& outOrder = outBlockedDesc->getOrder(); @@ -2821,13 +2627,11 @@ void GraphOptimizer::MergeTransposeAndReorder(Graph& graph) { } } -void GraphOptimizer::MergeReorderAndTranspose(Graph &graph) { +void GraphOptimizer::MergeReorderAndTranspose(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableTranspose = [](NodePtr node) { - return node->getType() == Type::Transpose - && node->getChildEdges().size() == 1 - && !node->isDynamicNode(); + return node->getType() == Type::Transpose && node->getChildEdges().size() == 1 && !node->isDynamicNode(); }; auto isSuitableReshape = [](NodePtr node) { @@ -2917,7 +2721,8 @@ void GraphOptimizer::MergeReorderAndTranspose(Graph &graph) { auto transposeNode = std::dynamic_pointer_cast(childNode); auto reorderNode = std::dynamic_pointer_cast(parentNode); - std::shared_ptr reshapeNode = intermNode != nullptr ? 
std::dynamic_pointer_cast(intermNode) : nullptr; + std::shared_ptr reshapeNode = + intermNode != nullptr ? std::dynamic_pointer_cast(intermNode) : nullptr; if (!transposeNode || !reorderNode || (intermNode && !reshapeNode)) { continue; } @@ -2926,15 +2731,20 @@ void GraphOptimizer::MergeReorderAndTranspose(Graph &graph) { auto descAfterTranspose = transposeNode->getSelectedPrimitiveDescriptor()->getConfig().outConfs[0].getMemDesc(); auto layoutOrder = updateOrder(descAfterTranspose->as()->getOrder(), reshapeNode); - auto inBlockedDesc = reorderNode->getSelectedPrimitiveDescriptor()->getConfig().inConfs[0].getMemDesc()->as(); - auto outBlockedDesc = reorderNode->getSelectedPrimitiveDescriptor()->getConfig().outConfs[0].getMemDesc()->as(); + auto inBlockedDesc = + reorderNode->getSelectedPrimitiveDescriptor()->getConfig().inConfs[0].getMemDesc()->as(); + auto outBlockedDesc = reorderNode->getSelectedPrimitiveDescriptor() + ->getConfig() + .outConfs[0] + .getMemDesc() + ->as(); auto& inOrder = inBlockedDesc->getOrder(); auto& outOrder = outBlockedDesc->getOrder(); if (checkAscendingFinalOrder(transposeOrder, layoutOrder, inOrder, outOrder)) { - // Reorder node doesn't support (with rare exceptions) reordering in case of different ranks on input and output. - // So the merge can be performed only in the case when the fused reorder will be optimized. + // Reorder node doesn't support (with rare exceptions) reordering in case of different ranks on input and + // output. So the merge can be performed only in the case when the fused reorder will be optimized. 
if (parentNode->getInputShapeAtPort(0).getRank() != childNode->getOutputShapeAtPort(0).getRank() && !canBeInplaced(parentNode, childNode)) { continue; @@ -2944,14 +2754,15 @@ void GraphOptimizer::MergeReorderAndTranspose(Graph &graph) { } } -void GraphOptimizer::reshapeRnnSeq(Graph &graph) { +void GraphOptimizer::reshapeRnnSeq(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableParentNode = [](NodePtr node) { if (node->type != Type::RNNSeq) return false; auto rnnNode = std::dynamic_pointer_cast(node); - return rnnNode && !rnnNode->hasNativeOrder() && node->outputShapes[0].getRank() == 4 && node->outputShapes[0].getDims()[1] == 1; + return rnnNode && !rnnNode->hasNativeOrder() && node->outputShapes[0].getRank() == 4 && + node->outputShapes[0].getDims()[1] == 1; }; for (size_t i = 0; i < graphNodes.size(); i++) { @@ -2973,10 +2784,12 @@ void GraphOptimizer::reshapeRnnSeq(Graph &graph) { auto edge = childrenEdges[j]; auto childNode = edge->getChild(); - const auto secondInput = std::make_shared(ov::element::i32, ov::Shape{1}, std::vector{1}); + const auto secondInput = + std::make_shared(ov::element::i32, ov::Shape{1}, std::vector{1}); const auto unsqueeze = std::make_shared( std::make_shared(parentNode->getOriginalOutputPrecisionAtPort(0), - parentNode->getOutputShapeAtPort(0).toPartialShape()), secondInput); + parentNode->getOutputShapeAtPort(0).toPartialShape()), + secondInput); unsqueeze->set_friendly_name(parentNode->getName() + "_abc_a1bc_" + std::to_string(j)); const auto cpuUnsqueeze = std::make_shared(unsqueeze, graph.getGraphContext()); @@ -3016,7 +2829,7 @@ void GraphOptimizer::RemoveSameConvert(Graph& graph) { } } -void GraphOptimizer::RemoveMemoryInputConvert(Graph &graph) { +void GraphOptimizer::RemoveMemoryInputConvert(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableNode = [](const NodePtr& node) { @@ -3042,7 +2855,7 @@ void GraphOptimizer::RemoveMemoryInputConvert(Graph &graph) { } } -void 
GraphOptimizer::RemoveConvertMemoryOutput(Graph &graph) { +void GraphOptimizer::RemoveConvertMemoryOutput(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableNode = [](const NodePtr& node) { @@ -3070,7 +2883,7 @@ void GraphOptimizer::RemoveConvertMemoryOutput(Graph &graph) { } } -void GraphOptimizer::MatchSdpaKvCache(Graph &graph) { +void GraphOptimizer::MatchSdpaKvCache(Graph& graph) { auto& graphNodes = graph.GetNodes(); auto isSuitableMemInput = [](const NodePtr& node) -> bool { @@ -3087,7 +2900,7 @@ void GraphOptimizer::MatchSdpaKvCache(Graph &graph) { if (Type::ScaledDotProductAttention == childNode->getType()) { if (childSdpa && childSdpa != childNode) { - //only one child SDPA supported + // only one child SDPA supported return false; } childSdpa = childNode; @@ -3130,7 +2943,7 @@ void GraphOptimizer::MatchSdpaKvCache(Graph &graph) { input_prc = ov::optional(node->getOriginalInputPrecisionAtPort(0)); } - //search for SDPA + // search for SDPA std::shared_ptr sdpa; for (auto&& edge : node->getChildEdgesAtPort(0)) { auto child = edge->getChild(); @@ -3144,19 +2957,18 @@ void GraphOptimizer::MatchSdpaKvCache(Graph &graph) { } } - //capture reference to the original mem output before graph transformations + // capture reference to the original mem output before graph transformations auto& memOutput = memInputNode->getOutputNode(); - auto memInputSdpa = std::make_shared( - memInputNode->getId(), - memInputNode->getName(), - memInputNode->getTypeStr(), - memInputNode->getOutputShapeAtPort(0), - memInputNode->getOriginalOutputPrecisionAtPort(0), - graph.getGraphContext(), - input_shape, - input_prc, - sdpa); + auto memInputSdpa = std::make_shared(memInputNode->getId(), + memInputNode->getName(), + memInputNode->getTypeStr(), + memInputNode->getOutputShapeAtPort(0), + memInputNode->getOriginalOutputPrecisionAtPort(0), + graph.getGraphContext(), + input_shape, + input_prc, + sdpa); if (!memInputNode->getParentEdges().empty()) { auto parentEdge = 
memInputNode->getParentEdgeAt(0); @@ -3173,14 +2985,13 @@ void GraphOptimizer::MatchSdpaKvCache(Graph &graph) { graph.CreateEdge(memInputSdpa, child, 0, outputNum); } - //create a stub memory output - auto memOutputStub = std::make_shared( - memOutput.getId(), - memOutput.getName(), - memOutput.getTypeStr(), - memOutput.getInputShapeAtPort(0), - memOutput.getOriginalInputPrecisionAtPort(0), - graph.getGraphContext()); + // create a stub memory output + auto memOutputStub = std::make_shared(memOutput.getId(), + memOutput.getName(), + memOutput.getTypeStr(), + memOutput.getInputShapeAtPort(0), + memOutput.getOriginalInputPrecisionAtPort(0), + graph.getGraphContext()); auto memOutputEdge = memOutput.getParentEdgeAt(0); const auto inputNum = memOutputEdge->getInputNum(); @@ -3192,7 +3003,7 @@ void GraphOptimizer::MatchSdpaKvCache(Graph &graph) { } } -void GraphOptimizer::DropRedundantMemoryOutput(Graph &graph) { +void GraphOptimizer::DropRedundantMemoryOutput(Graph& graph) { // When we have a MemoryInput->MemoryOutput pair, that means that the state is immediately populated with the init // subgraph values when the init subgraph exists. In all the other cases the state is simply a read only object. 
// We can optimize such a case removing the MemoryOutput node and transferring the state values update @@ -3233,7 +3044,7 @@ void GraphOptimizer::DropRedundantMemoryOutput(Graph &graph) { } if (MemoryOutput && MemoryOutput != childNode) { - //only one child MemoryOutput is expected + // only one child MemoryOutput is expected return false; } MemoryOutput = childNode; @@ -3261,7 +3072,7 @@ void GraphOptimizer::DropRedundantMemoryOutput(Graph &graph) { inputPrc = ov::optional(node->getOriginalInputPrecisionAtPort(0)); } - //search for the MemoryOutputNode + // search for the MemoryOutputNode NodePtr memoryOutputNode; for (auto&& edge : node->getChildEdgesAtPort(0)) { auto child = edge->getChild(); @@ -3304,5 +3115,5 @@ void GraphOptimizer::DropRedundantMemoryOutput(Graph &graph) { } } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/graph_optimizer.h b/src/plugins/intel_cpu/src/graph_optimizer.h index 886296a7c0053b..90cf9c41c0907e 100644 --- a/src/plugins/intel_cpu/src/graph_optimizer.h +++ b/src/plugins/intel_cpu/src/graph_optimizer.h @@ -16,43 +16,42 @@ class GraphOptimizer { public: void ApplyCommonGraphOptimizations(Graph& graph); void ApplyImplSpecificGraphOptimizations(Graph& graph); - void ShareReorders(Graph &graph); + void ShareReorders(Graph& graph); private: - void FuseConvMatmulFCDeconvAndDQScales(Graph &graph); - void FuseFCAndWeightsDecompression(Graph &graph); - void FuseConvolutionMatMulDeconvAndBias(Graph &graph); - void FuseDeconvolutionAndSimpleOperation(Graph &graph); - void FuseMultiplyAndAdd(Graph &graph); + void FuseConvMatmulFCDeconvAndDQScales(Graph& graph); + void FuseConvolutionMatMulDeconvAndBias(Graph& graph); + void FuseDeconvolutionAndSimpleOperation(Graph& graph); + void FuseMultiplyAndAdd(Graph& graph); void MergeConvertAndScaleShift(Graph& graph); void FuseFCAndConvertOnWeights(Graph& graph); void FuseFCAndTransposeOnWeights(Graph& graph); - void 
FuseFullyConnectedAndSimpleOperation(Graph &graph); - void FuseMatMulAndSimpleOperation(Graph &graph); - void FuseConvolutionAndSimpleOperationThroughMaxPool(Graph &graph); - void FuseConvolutionAndSimpleOperation(Graph &graph); - void FuseConvolutionAndDWConvolution(Graph &graph); - void FusePoolingAndFakeQuantize(Graph &graph); - void FuseConvolutionSumAndConvolutionSumActivation(Graph &graph); - void FuseMVNAndSimpleOperation(Graph &graph); - void FuseInterpolateAndSimpleOperation(Graph &graph); - void FuseNormalizeL2AndSimpleOperation(Graph &graph); - void FuseReduceAndSimpleOperation(Graph &graph); + void FuseFullyConnectedAndSimpleOperation(Graph& graph); + void FuseMatMulAndSimpleOperation(Graph& graph); + void FuseConvolutionAndSimpleOperationThroughMaxPool(Graph& graph); + void FuseConvolutionAndSimpleOperation(Graph& graph); + void FuseConvolutionAndDWConvolution(Graph& graph); + void FusePoolingAndFakeQuantize(Graph& graph); + void FuseConvolutionSumAndConvolutionSumActivation(Graph& graph); + void FuseMVNAndSimpleOperation(Graph& graph); + void FuseInterpolateAndSimpleOperation(Graph& graph); + void FuseNormalizeL2AndSimpleOperation(Graph& graph); + void FuseReduceAndSimpleOperation(Graph& graph); void DropDoubleReorders(Graph& graph); - void FuseConvolutionAndZeroPoints(Graph &graph); - void FuseBroadcastAndEltwise(Graph &graph); - void FuseEltwiseAndSimple(Graph &graph); - void FusePerformedAsScaleShiftAndFakeQuantize(Graph &graph); - void FuseClampAndFakeQuantize(Graph &graph); - void MergeTransposeAndReorder(Graph &graph); - void MergeReorderAndTranspose(Graph &graph); - void reshapeRnnSeq(Graph &graph); - void RemoveSameConvert(Graph &graph); - void RemoveMemoryInputConvert(Graph &graph); - void RemoveConvertMemoryOutput(Graph &graph); - void MatchSdpaKvCache(Graph &graph); - void DropRedundantMemoryOutput(Graph &graph); + void FuseConvolutionAndZeroPoints(Graph& graph); + void FuseBroadcastAndEltwise(Graph& graph); + void 
FuseEltwiseAndSimple(Graph& graph); + void FusePerformedAsScaleShiftAndFakeQuantize(Graph& graph); + void FuseClampAndFakeQuantize(Graph& graph); + void MergeTransposeAndReorder(Graph& graph); + void MergeReorderAndTranspose(Graph& graph); + void reshapeRnnSeq(Graph& graph); + void RemoveSameConvert(Graph& graph); + void RemoveMemoryInputConvert(Graph& graph); + void RemoveConvertMemoryOutput(Graph& graph); + void MatchSdpaKvCache(Graph& graph); + void DropRedundantMemoryOutput(Graph& graph); bool canBeInplaced(const NodePtr& parentNode, const NodePtr& childNode); // Method checks that after the sequential execution of Transpose and Reorder nodes, @@ -69,19 +68,22 @@ class GraphOptimizer { // Examples: // 1. Direct order, no Reshape node. // Before: [N,C,H,W]abcd==>Transpose(0312)==>[N,W,C,H]abcd==>Reorder(abcd->acdb)==>[N,W,C,H]acdb - // [N,C,H,W]abcd is equivalent to the [N,W,C,H]acdb, so the Transpose and Reorder can be fused into single optimized Reorder: - // After: [N,C,H,W]abcd==>Reorder(abcd->acdb, isOptimized=true)==>[N,W,C,H]acdb + // [N,C,H,W]abcd is equivalent to the [N,W,C,H]acdb, so the Transpose and Reorder can be fused into single + // optimized Reorder: After: [N,C,H,W]abcd==>Reorder(abcd->acdb, isOptimized=true)==>[N,W,C,H]acdb // 2. Reverse order, no Reshape node. // Before: [N,W,C,H]acdb==>Reorder(acdb->abcd)==>[N,W,C,H]abcd==>Transpose(0231)==>[N,C,H,W]abcd - // [N,W,C,H]acdb is equivalent to the [N,C,H,W]abcd, so the Transpose and Reorder can be fused into single optimized Reorder: - // After: [N,W,C,H]acdb==>Reorder(acdb->abcd, isOptimized=true)==>[N,C,H,W]abcd + // [N,W,C,H]acdb is equivalent to the [N,C,H,W]abcd, so the Transpose and Reorder can be fused into single + // optimized Reorder: After: [N,W,C,H]acdb==>Reorder(acdb->abcd, isOptimized=true)==>[N,C,H,W]abcd // 3. Direct order with Reshape node (L = H x w). 
- // Before: [N,L,C]abc==>Transpose(021)==>[N,C,L]abc==>Reshape==>[N,C,H,W]abcd==>Reoder(abcd->acdb)==>[N,C,H,W]acdb - // After: [N,L,C]abc==>Reorder(abc->acdb, isOptimized=true)==>[N,C,H,W]acdb + // Before: + // [N,L,C]abc==>Transpose(021)==>[N,C,L]abc==>Reshape==>[N,C,H,W]abcd==>Reoder(abcd->acdb)==>[N,C,H,W]acdb After: + // [N,L,C]abc==>Reorder(abc->acdb, isOptimized=true)==>[N,C,H,W]acdb // 4. Reverse order with Reshape node (L = H x W). - // Before: [N,C,H,W]acdb==>Reorder(acdb->abcd)==>[N,C,H,W]abcd==>Reshape==>[N,C,L]abc==>Transpose(021)==>[N,L,C]abc + // Before: + // [N,C,H,W]acdb==>Reorder(acdb->abcd)==>[N,C,H,W]abcd==>Reshape==>[N,C,L]abc==>Transpose(021)==>[N,L,C]abc // After: [N,C,H,W]acdb==>Reorder(acdb->abc, isOptimized=true)==>[N,L,C]abc - // Note: in some cases (inplace conflicts or transpose with blocked input and non-blocked output) the merged Reorder can not be optimized. + // Note: in some cases (inplace conflicts or transpose with blocked input and non-blocked output) the merged Reorder + // can not be optimized. 
void mergeTransposeReshapeReorder(Graph& graph, const NodePtr& transposeNode, const NodePtr& reshapeNode, @@ -89,5 +91,5 @@ class GraphOptimizer { const bool reverseOrder); }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/infer_request.cpp b/src/plugins/intel_cpu/src/infer_request.cpp index 26cdaf0860168a..3cfc34589623d2 100644 --- a/src/plugins/intel_cpu/src/infer_request.cpp +++ b/src/plugins/intel_cpu/src/infer_request.cpp @@ -8,17 +8,17 @@ #include "compiled_model.h" #include "dnnl_extension_utils.h" #include "itt.h" +#include "memory_desc/cpu_memory_desc_utils.h" #include "memory_state.h" #include "nodes/common/cpu_convert.h" -#include "memory_desc/cpu_memory_desc_utils.h" #include "nodes/memory_state_base.h" #include "openvino/core/shape.hpp" #include "openvino/runtime/make_tensor.hpp" #include "openvino/runtime/tensor.hpp" +#include "openvino/runtime/threading/cpu_message.hpp" #include "proxy_mem_blk.h" #include "utils/general_utils.h" #include "utils/ngraph_utils.hpp" -#include "openvino/runtime/threading/cpu_message.hpp" using OvString = ov::element_type_traits::value_type; @@ -56,7 +56,7 @@ void SyncInferRequest::create_infer_request() { init_tensor(it.first, ov::ISyncInferRequest::FoundPort::Type::OUTPUT); } - //create states according to the list of the MemoryStateNodes + // create states according to the list of the MemoryStateNodes for (auto&& node : m_graph->getInternalStateNodes()) { m_memory_states.emplace_back(node.second->makeState()); } @@ -162,7 +162,7 @@ static inline void change_edge_ptr(const EdgePtr& edge, ov::SoPtr& OPENVINO_ASSERT(mem != nullptr, "Edge with name '", *edge, "' doesn't have allocated memory object."); if (tensor->get_element_type() == element::string) { - auto memBlock = dynamic_cast(mem.get())->getStringMemoryBlockPtr(); + auto memBlock = dynamic_cast(mem.get())->getStringMemoryBlockPtr(); OPENVINO_ASSERT(memBlock); 
memBlock->setExtBuff(tensor->data(), tensor->get_size()); } else { @@ -177,14 +177,14 @@ void SyncInferRequest::change_default_ptr() { const auto& outputNodesMap = m_graph->GetOutputNodesMap(); std::unordered_set inputPtrs; - std::function& tensor)> changeInpPtr; + std::function& tensor)> changeInpPtr; if (m_graph->IsDynamic()) { - changeInpPtr = [&inputPtrs](const EdgePtr &edge, ov::SoPtr& tensor) { + changeInpPtr = [&inputPtrs](const EdgePtr& edge, ov::SoPtr& tensor) { change_edge_ptr(edge, tensor); inputPtrs.insert(tensor->data()); }; } else { - changeInpPtr = [](const EdgePtr &edge, ov::SoPtr& tensor) { + changeInpPtr = [](const EdgePtr& edge, ov::SoPtr& tensor) { change_edge_ptr(edge, tensor); }; } @@ -279,7 +279,7 @@ void SyncInferRequest::change_default_ptr() { } if (m_graph->IsDynamic()) { - const auto &outMemBlocksMap = m_graph->getOutputNodesMemBlocksMap(); + const auto& outMemBlocksMap = m_graph->getOutputNodesMemBlocksMap(); for (auto&& item : outMemBlocksMap) { const auto& name = item.first; @@ -291,20 +291,32 @@ void SyncInferRequest::change_default_ptr() { if (controlBlockItr != m_outputControlBlocks.end()) { auto output = outputNodesMap.find(name); - OPENVINO_ASSERT(outputNodesMap.end() != output, "Node with name: ", name, " is absent in the outputNodesMap"); + OPENVINO_ASSERT(outputNodesMap.end() != output, + "Node with name: ", + name, + " is absent in the outputNodesMap"); auto parentEdge = output->second->getParentEdgeAt(0); - //avoid cyclic memory use + // avoid cyclic memory use auto&& controlBlock = controlBlockItr->second; - std::shared_ptr memBlock = inputPtrs.count(controlBlock.rawPtr()) ? // same memory is used on the input and output - controlBlock.nextMemBlock() : // then swap internal buffer to avoid data corruption - controlBlock.currentMemBlock(); // else reuse the existing buffer + std::shared_ptr memBlock = + inputPtrs.count(controlBlock.rawPtr()) ? 
// same memory is used on the input and output + controlBlock.nextMemBlock() + : // then swap internal buffer to avoid data corruption + controlBlock.currentMemBlock(); // else reuse the existing buffer outputMemBlock->setMemBlockResize(memBlock); - DEBUG_LOG("reset proxy ", outputMemBlock, ", actual ", controlBlock.currentMemBlock(), " graph ", m_graph, " inferrequest ", this); + DEBUG_LOG("reset proxy ", + outputMemBlock, + ", actual ", + controlBlock.currentMemBlock(), + " graph ", + m_graph, + " inferrequest ", + this); DEBUG_LOG(name, ", tensor ", controlBlock.tensor()); } else { - outputMemBlock->reset(); // switch to the internal memory since memory sharing is no longer possible + outputMemBlock->reset(); // switch to the internal memory since memory sharing is no longer possible } } } @@ -456,12 +468,13 @@ void SyncInferRequest::set_tensor(const ov::Output& in_port, con } m_outputs[output_index] = tensor; - m_outputControlBlocks.erase(output_index); // now the memory is under user's control + m_outputControlBlocks.erase(output_index); // now the memory is under user's control } ov::ISyncInferRequest::set_tensor(port, tensor); } -void SyncInferRequest::set_tensors_impl(const ov::Output port, const std::vector>& tensors) { +void SyncInferRequest::set_tensors_impl(const ov::Output port, + const std::vector>& tensors) { if (find_port(port).is_input()) { m_batched_tensors[port.get_tensor_ptr()] = tensors; return; @@ -541,7 +554,8 @@ void SyncInferRequest::init_tensor(const std::size_t& port_index, const ov::ISyn } dnnl::engine eng(dnnl::engine::kind::cpu, 0); - CpuBlockedMemoryDescPtr desc = std::make_shared(model_prec, Shape{memDims}); + CpuBlockedMemoryDescPtr desc = + std::make_shared(model_prec, Shape{memDims}); auto memory = std::make_shared(eng, desc); tensor = std::make_shared(memory); @@ -551,12 +565,12 @@ void SyncInferRequest::init_tensor(const std::size_t& port_index, const ov::ISyn OutputControlBlock control_block{model_prec, Shape{shape}}; 
DEBUG_LOG(port_index, - ", tensor ", - control_block.tensor(), - ", memBlock ", - control_block.tensor()->get_memory()->getMemoryBlock(), - "memory object ", - control_block.tensor()->get_memory().get()); + ", tensor ", + control_block.tensor(), + ", memBlock ", + control_block.tensor()->get_memory()->getMemoryBlock(), + "memory object ", + control_block.tensor()->get_memory().get()); tensor = control_block.tensor(); if (model_prec == graph_prec) @@ -602,7 +616,7 @@ SyncInferRequest::OutputControlBlock::OutputControlBlock(const ov::element::Type m_proxyMemBlock = std::make_shared(m_buffers[m_buffIndx]); VectorDims memDims; - if (shape.isDynamic()) { // this is a WA since the ITensor doesn't allow dyn shapes + if (shape.isDynamic()) { // this is a WA since the ITensor doesn't allow dyn shapes for (auto&& item : shape.getDims()) { memDims.push_back(item != Shape::UNDEFINED_DIM ? item : 0); } @@ -610,8 +624,7 @@ SyncInferRequest::OutputControlBlock::OutputControlBlock(const ov::element::Type memDims = shape.getStaticDims(); } - CpuBlockedMemoryDescPtr desc = - std::make_shared(precision, Shape{memDims}); + CpuBlockedMemoryDescPtr desc = std::make_shared(precision, Shape{memDims}); auto memory = std::make_shared(eng, desc, m_proxyMemBlock); m_tensor = std::make_shared(memory); @@ -649,6 +662,5 @@ void SyncInferRequest::sub_streams_infer() { } } -} // namespace intel_cpu -} // namespace ov - +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/infer_request.h b/src/plugins/intel_cpu/src/infer_request.h index a9def63d359744..b66387ecc4d4d5 100644 --- a/src/plugins/intel_cpu/src/infer_request.h +++ b/src/plugins/intel_cpu/src/infer_request.h @@ -4,11 +4,11 @@ #pragma once -#include "graph.h" #include "cpu_tensor.h" +#include "graph.h" +#include "memory_state.h" #include "openvino/runtime/iinfer_request.hpp" #include "openvino/runtime/isync_infer_request.hpp" -#include "memory_state.h" namespace ov { namespace intel_cpu { @@ -29,7 +29,8 @@ 
class SyncInferRequest : public ov::ISyncInferRequest { void set_tensor(const ov::Output& port, const ov::SoPtr& tensor) override; - void set_tensors_impl(const ov::Output port, const std::vector>& tensors) override; + void set_tensors_impl(const ov::Output port, + const std::vector>& tensors) override; ov::SoPtr get_tensor(const ov::Output& port) const override; std::vector> get_tensors(const ov::Output& _port) const override; diff --git a/src/plugins/intel_cpu/src/memory_control.cpp b/src/plugins/intel_cpu/src/memory_control.cpp index 0f202c296891c1..26cd8459458b9d 100644 --- a/src/plugins/intel_cpu/src/memory_control.cpp +++ b/src/plugins/intel_cpu/src/memory_control.cpp @@ -16,8 +16,7 @@ namespace { class StaticPartitionMemoryBlock : public IMemoryBlockObserver { public: - StaticPartitionMemoryBlock(MemoryBlockPtr pBlock, ptrdiff_t offset) - : m_pBlock(pBlock), m_offset(offset) { + StaticPartitionMemoryBlock(MemoryBlockPtr pBlock, ptrdiff_t offset) : m_pBlock(pBlock), m_offset(offset) { OPENVINO_ASSERT(m_pBlock, "Memory block is uninitialized"); } @@ -92,7 +91,7 @@ class IMemoryManager { using MemoryManagerPtr = std::shared_ptr; -template +template std::shared_ptr makeDnnlMemoryBlock(Args&&... 
args) { return std::make_shared(make_unique(std::forward(args)...)); } @@ -152,10 +151,12 @@ class MemoryManagerStatic : public IMemoryManager { } void allocate() override { - if (m_workspace) m_workspace->resize(m_totalSize); + if (m_workspace) + m_workspace->resize(m_totalSize); } void release() override { - if (m_workspace) m_workspace->free(); + if (m_workspace) + m_workspace->free(); } private: @@ -171,14 +172,13 @@ class MemoryManageNonOverlapingSets : public IMemoryManager { void insert(const MemoryRegion& reg) override { MemorySolver::Box box = {reg.start, reg.finish, reg.size, reg.id}; if (-1 != reg.finish) { - //We have to extend the lifespan of tensors that are crossing a sync point border in order to save - //the intermediate computation results from possible loss due to the tensor resize - auto itr_upper = - std::upper_bound(m_syncInds.begin(), m_syncInds.end(), box.finish, [](int y, int x) { - return y <= x; - }); + // We have to extend the lifespan of tensors that are crossing a sync point border in order to save + // the intermediate computation results from possible loss due to the tensor resize + auto itr_upper = std::upper_bound(m_syncInds.begin(), m_syncInds.end(), box.finish, [](int y, int x) { + return y <= x; + }); auto itr_lower = std::lower_bound(m_syncInds.begin(), m_syncInds.end(), box.start); - if (itr_lower != itr_upper) { // across sections + if (itr_lower != itr_upper) { // across sections if (itr_upper == m_syncInds.end()) { box.finish = -1; } else { @@ -201,7 +201,7 @@ class MemoryManageNonOverlapingSets : public IMemoryManager { void solve() { ov::MemorySolver::normalize_boxes(m_boxes); - std::vector> groups; //groups of nonoverlapping boxes + std::vector> groups; // groups of nonoverlapping boxes groups.push_back({m_boxes.front()}); for (size_t i = 1; i < m_boxes.size(); ++i) { const auto& box = m_boxes[i]; @@ -229,7 +229,7 @@ class MemoryManageNonOverlapingSets : public IMemoryManager { } void allocate() override { - //nothing to 
do + // nothing to do } void release() override { for (auto&& item : m_internalBlocks) { @@ -305,15 +305,17 @@ MemoryControl::MemoryControl(std::vector syncInds) { })); // handler for static tensors - m_handlers.emplace_back(buildHandler([](const MemoryRegion& reg) { - if (reg.size >= 0 || MemoryRegion::RegionType::VARIABLE != reg.type || - MemoryRegion::AllocType::POD != reg.alloc_type) { - return false; - } - return true; - }, std::move(syncInds))); + m_handlers.emplace_back(buildHandler( + [](const MemoryRegion& reg) { + if (reg.size >= 0 || MemoryRegion::RegionType::VARIABLE != reg.type || + MemoryRegion::AllocType::POD != reg.alloc_type) { + return false; + } + return true; + }, + std::move(syncInds))); - //handler for I/O tensors, so far simply individual blocks + // handler for I/O tensors, so far simply individual blocks m_handlers.emplace_back(buildHandler([](const MemoryRegion& reg) { if (MemoryRegion::RegionType::VARIABLE == reg.type || reg.alloc_type != MemoryRegion::AllocType::POD) { return false; diff --git a/src/plugins/intel_cpu/src/memory_desc/blocked_memory_desc.cpp b/src/plugins/intel_cpu/src/memory_desc/blocked_memory_desc.cpp index 4b75d5c5263398..7dff6905df09d9 100644 --- a/src/plugins/intel_cpu/src/memory_desc/blocked_memory_desc.cpp +++ b/src/plugins/intel_cpu/src/memory_desc/blocked_memory_desc.cpp @@ -15,9 +15,9 @@ namespace intel_cpu { constexpr BlockedMemoryDesc::CmpMask BlockedMemoryDesc::FULL_MASK; constexpr BlockedMemoryDesc::CmpMask BlockedMemoryDesc::EMPTY_MASK; constexpr BlockedMemoryDesc::CmpMask BlockedMemoryDesc::SKIP_OFFSET_MASK; -constexpr size_t BlockedMemoryDesc::OFFSET_MASK_POS; +constexpr size_t BlockedMemoryDesc::OFFSET_MASK_POS; -bool BlockedMemoryDesc::isCompatibleInternal(const BlockedMemoryDesc &rhs, CmpMask cmpMask) const { +bool BlockedMemoryDesc::isCompatibleInternal(const BlockedMemoryDesc& rhs, CmpMask cmpMask) const { if (this->getShape() != rhs.getShape() || this->getPrecision() != rhs.getPrecision()) return 
false; @@ -77,5 +77,5 @@ std::string BlockedMemoryDesc::serializeFormat() const { return result.str(); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/memory_desc/blocked_memory_desc.h b/src/plugins/intel_cpu/src/memory_desc/blocked_memory_desc.h index d938a4ba585602..9ff132965bdc0b 100644 --- a/src/plugins/intel_cpu/src/memory_desc/blocked_memory_desc.h +++ b/src/plugins/intel_cpu/src/memory_desc/blocked_memory_desc.h @@ -21,7 +21,7 @@ class BlockedMemoryDesc : public virtual MemoryDesc { static constexpr CmpMask FULL_MASK{0xffffffff}; static constexpr CmpMask EMPTY_MASK{0x0}; static constexpr CmpMask SKIP_OFFSET_MASK{0x7fffffff}; - static constexpr size_t OFFSET_MASK_POS{31}; + static constexpr size_t OFFSET_MASK_POS{31}; /** * @brief Returns the blocked dimensions @@ -73,7 +73,7 @@ class BlockedMemoryDesc : public virtual MemoryDesc { * * @return the result of the compatibility check */ - virtual bool isCompatible(const BlockedMemoryDesc &rhs, CmpMask cmpMask) const = 0; + virtual bool isCompatible(const BlockedMemoryDesc& rhs, CmpMask cmpMask) const = 0; using MemoryDesc::isCompatible; ~BlockedMemoryDesc() override = default; @@ -88,7 +88,7 @@ class BlockedMemoryDesc : public virtual MemoryDesc { * Doesn't perform descs specific attributes check * @return true if compatible, otherwise false */ - bool isCompatibleInternal(const BlockedMemoryDesc &rhs, CmpMask cmpMask = FULL_MASK) const; + bool isCompatibleInternal(const BlockedMemoryDesc& rhs, CmpMask cmpMask = FULL_MASK) const; mutable VectorDims blockedDims; mutable VectorDims strides; @@ -99,5 +99,5 @@ class BlockedMemoryDesc : public virtual MemoryDesc { using BlockedMemoryDescPtr = std::shared_ptr; using BlockedMemoryDescCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/memory_desc/cpu_blocked_memory_desc.cpp 
b/src/plugins/intel_cpu/src/memory_desc/cpu_blocked_memory_desc.cpp index d1c50d0048c57d..c95463207a9c46 100644 --- a/src/plugins/intel_cpu/src/memory_desc/cpu_blocked_memory_desc.cpp +++ b/src/plugins/intel_cpu/src/memory_desc/cpu_blocked_memory_desc.cpp @@ -3,6 +3,7 @@ // #include "cpu_blocked_memory_desc.h" + #include "dnnl_blocked_memory_desc.h" #include "utils/general_utils.h" @@ -15,17 +16,27 @@ static VectorDims makeRange(size_t size) { return retVec; } -CpuBlockedMemoryDesc::CpuBlockedMemoryDesc(ov::element::Type prc, const Shape& shape) : - CpuBlockedMemoryDesc(prc, shape, shape.getDims(), makeRange(shape.getDims().size())) {} - -CpuBlockedMemoryDesc::CpuBlockedMemoryDesc(ov::element::Type prc, const Shape& shape, const VectorDims& blockedDims, - const VectorDims& order, size_t offsetPadding, const VectorDims& offsetPaddingToData, - const VectorDims& strides) : MemoryDesc(shape, Blocked), precision(prc) { - if (std::any_of(order.begin(), order.end(), [](size_t val) { return val == Shape::UNDEFINED_DIM; })) { +CpuBlockedMemoryDesc::CpuBlockedMemoryDesc(ov::element::Type prc, const Shape& shape) + : CpuBlockedMemoryDesc(prc, shape, shape.getDims(), makeRange(shape.getDims().size())) {} + +CpuBlockedMemoryDesc::CpuBlockedMemoryDesc(ov::element::Type prc, + const Shape& shape, + const VectorDims& blockedDims, + const VectorDims& order, + size_t offsetPadding, + const VectorDims& offsetPaddingToData, + const VectorDims& strides) + : MemoryDesc(shape, Blocked), + precision(prc) { + if (std::any_of(order.begin(), order.end(), [](size_t val) { + return val == Shape::UNDEFINED_DIM; + })) { OPENVINO_THROW("CpuBlockedMemoryDesc do not support undefined order."); } - if (std::any_of(blockedDims.begin() + shape.getRank(), blockedDims.end(), [](size_t val) { return val == Shape::UNDEFINED_DIM; })) { + if (std::any_of(blockedDims.begin() + shape.getRank(), blockedDims.end(), [](size_t val) { + return val == Shape::UNDEFINED_DIM; + })) { 
OPENVINO_THROW("CpuBlockedMemoryDesc doesn't support undefined blockedDims."); } @@ -51,29 +62,43 @@ CpuBlockedMemoryDesc::CpuBlockedMemoryDesc(ov::element::Type prc, const Shape& s if (strides.empty() && !order.empty()) { if (shape.hasZeroDims()) { this->strides.resize(order.size(), 0); - } else if (std::any_of(this->blockedDims.begin(), this->blockedDims.end(), [](size_t val) { return val == Shape::UNDEFINED_DIM; })) { + } else if (std::any_of(this->blockedDims.begin(), this->blockedDims.end(), [](size_t val) { + return val == Shape::UNDEFINED_DIM; + })) { this->strides.resize(order.size(), Shape::UNDEFINED_DIM); } else { this->strides.resize(order.size(), 1); for (size_t i = 2; i <= order.size(); i++) { - this->strides[order.size() - i] = this->strides[order.size() - (i - 1)] * this->blockedDims[blockedDims.size() - (i - 1)]; + this->strides[order.size() - i] = + this->strides[order.size() - (i - 1)] * this->blockedDims[blockedDims.size() - (i - 1)]; } } } else { this->strides = strides; } - if (!everyone_is(this->order.size(), this->blockedDims.size(), this->offsetPaddingToData.size(), this->strides.size())) { + if (!everyone_is(this->order.size(), + this->blockedDims.size(), + this->offsetPaddingToData.size(), + this->strides.size())) { OPENVINO_THROW("Order, blocked dims, offset padding to data and strides must have equals size"); } } bool CpuBlockedMemoryDesc::isDefinedImp() const { bool defined = true; - defined = defined && std::none_of(blockedDims.cbegin(), blockedDims.cend(), [](size_t val) { return val == Shape::UNDEFINED_DIM; }); - defined = defined && std::none_of(strides.cbegin(), strides.cend(), [](size_t val) { return val == Shape::UNDEFINED_DIM; }); - defined = defined && std::none_of(order.cbegin(), order.cend(), [](size_t val) { return val == Shape::UNDEFINED_DIM; }); - defined = defined && std::none_of(offsetPaddingToData.cbegin(), offsetPaddingToData.cend(), [](size_t val) { return val == Shape::UNDEFINED_DIM; }); + defined = defined && 
std::none_of(blockedDims.cbegin(), blockedDims.cend(), [](size_t val) { + return val == Shape::UNDEFINED_DIM; + }); + defined = defined && std::none_of(strides.cbegin(), strides.cend(), [](size_t val) { + return val == Shape::UNDEFINED_DIM; + }); + defined = defined && std::none_of(order.cbegin(), order.cend(), [](size_t val) { + return val == Shape::UNDEFINED_DIM; + }); + defined = defined && std::none_of(offsetPaddingToData.cbegin(), offsetPaddingToData.cend(), [](size_t val) { + return val == Shape::UNDEFINED_DIM; + }); defined = defined && offsetPadding != Shape::UNDEFINED_DIM; return defined; @@ -90,15 +115,15 @@ bool CpuBlockedMemoryDesc::isCompatible(const MemoryDesc& rhs) const { } } -bool CpuBlockedMemoryDesc::isCompatible(const CpuBlockedMemoryDesc &rhs, CmpMask cmpMask) const { +bool CpuBlockedMemoryDesc::isCompatible(const CpuBlockedMemoryDesc& rhs, CmpMask cmpMask) const { return BlockedMemoryDesc::isCompatibleInternal(rhs, cmpMask); } -bool CpuBlockedMemoryDesc::isCompatible(const DnnlBlockedMemoryDesc &rhs, CmpMask cmpMask) const { +bool CpuBlockedMemoryDesc::isCompatible(const DnnlBlockedMemoryDesc& rhs, CmpMask cmpMask) const { return rhs.isCompatible(*this, cmpMask); } -bool CpuBlockedMemoryDesc::isCompatible(const BlockedMemoryDesc &rhs, CmpMask cmpMask) const { +bool CpuBlockedMemoryDesc::isCompatible(const BlockedMemoryDesc& rhs, CmpMask cmpMask) const { const BlockedMemoryDesc* pRhs = &rhs; if (auto cpuBlkDesc = dynamic_cast(pRhs)) { return isCompatible(*cpuBlkDesc, cmpMask); @@ -149,7 +174,9 @@ size_t CpuBlockedMemoryDesc::getMaxMemSize() const { } const auto& maxDims = shape.getMaxDims(); - if (std::any_of(maxDims.begin(), maxDims.end(), [](size_t x){ return Shape::UNDEFINED_DIM == x; })) { + if (std::any_of(maxDims.begin(), maxDims.end(), [](size_t x) { + return Shape::UNDEFINED_DIM == x; + })) { return UNDEFINED_SIZE; } @@ -193,16 +220,16 @@ size_t CpuBlockedMemoryDesc::getElementOffset(size_t elemNumber) const { bool 
CpuBlockedMemoryDesc::hasLayoutType(LayoutType layoutType) const { switch (layoutType) { - case LayoutType::ncsp: - return isPlainFormat(); - case LayoutType::nspc: - return isTailCFormat(); - case LayoutType::nCsp8c: - return isBlockedCFormat(8); - case LayoutType::nCsp16c: - return isBlockedCFormat(16); - default: - return false; + case LayoutType::ncsp: + return isPlainFormat(); + case LayoutType::nspc: + return isTailCFormat(); + case LayoutType::nCsp8c: + return isBlockedCFormat(8); + case LayoutType::nCsp16c: + return isBlockedCFormat(16); + default: + return false; } } @@ -252,13 +279,15 @@ bool CpuBlockedMemoryDesc::isTailCFormat() const { return true; } -MemoryDescPtr CpuBlockedMemoryDesc::cloneWithNewDimsImp(const VectorDims &dims) const { - if (std::any_of(dims.begin(), dims.end(), [](size_t x){ return Shape::UNDEFINED_DIM == x; })) { +MemoryDescPtr CpuBlockedMemoryDesc::cloneWithNewDimsImp(const VectorDims& dims) const { + if (std::any_of(dims.begin(), dims.end(), [](size_t x) { + return Shape::UNDEFINED_DIM == x; + })) { OPENVINO_THROW("Can't clone desc if new dims are undefined"); } // TODO [DS]: add stride recalculation for strided blobs - for (int i = strides.size() - 2; i >= 0 ; i--) { + for (int i = strides.size() - 2; i >= 0; i--) { if (strides[i] == Shape::UNDEFINED_DIM) break; @@ -280,11 +309,18 @@ MemoryDescPtr CpuBlockedMemoryDesc::cloneWithNewDimsImp(const VectorDims &dims) } VectorDims newOffsetPaddingToData; - if (std::none_of(offsetPaddingToData.begin(), offsetPaddingToData.end(), [](size_t x){ return x == Shape::UNDEFINED_DIM;})) { + if (std::none_of(offsetPaddingToData.begin(), offsetPaddingToData.end(), [](size_t x) { + return x == Shape::UNDEFINED_DIM; + })) { newOffsetPaddingToData = offsetPaddingToData; } - return std::make_shared(precision, Shape(dims), newBlockedDims, order, offsetPadding, newOffsetPaddingToData); + return std::make_shared(precision, + Shape(dims), + newBlockedDims, + order, + offsetPadding, + 
newOffsetPaddingToData); } bool CpuBlockedMemoryDesc::blocksExtended() const { @@ -311,7 +347,9 @@ size_t CpuBlockedMemoryDesc::getPaddedElementsCount() const { if (getShape().hasZeroDims()) { return 0; } - if (std::any_of(blockedDims.begin(), blockedDims.end(), [](Dim dim) { return dim == Shape::UNDEFINED_DIM; })) { + if (std::any_of(blockedDims.begin(), blockedDims.end(), [](Dim dim) { + return dim == Shape::UNDEFINED_DIM; + })) { OPENVINO_THROW("Can't compute padded elements count for non undefined blocked dims"); } return std::accumulate(blockedDims.begin(), blockedDims.end(), size_t{1}, std::multiplies()); @@ -323,5 +361,5 @@ MemoryDescPtr CpuBlockedMemoryDesc::cloneWithNewPrecision(const ov::element::Typ return newDesc; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/memory_desc/cpu_blocked_memory_desc.h b/src/plugins/intel_cpu/src/memory_desc/cpu_blocked_memory_desc.h index 28badb4dac15fb..fdf931a262e854 100644 --- a/src/plugins/intel_cpu/src/memory_desc/cpu_blocked_memory_desc.h +++ b/src/plugins/intel_cpu/src/memory_desc/cpu_blocked_memory_desc.h @@ -16,8 +16,12 @@ class CpuBlockedMemoryDesc : public BlockedMemoryDesc { public: CpuBlockedMemoryDesc(ov::element::Type prc, const Shape& shape); - CpuBlockedMemoryDesc(ov::element::Type prc, const Shape& shape, const VectorDims& blockedDims, - const VectorDims& order, size_t offsetPadding = 0, const VectorDims& offsetPaddingToData = {}, + CpuBlockedMemoryDesc(ov::element::Type prc, + const Shape& shape, + const VectorDims& blockedDims, + const VectorDims& order, + size_t offsetPadding = 0, + const VectorDims& offsetPaddingToData = {}, const VectorDims& strides = {}); MemoryDescPtr clone() const override { @@ -26,8 +30,8 @@ class CpuBlockedMemoryDesc : public BlockedMemoryDesc { bool isCompatible(const MemoryDesc& rhs) const override; bool isCompatible(const BlockedMemoryDesc& rhs, CmpMask cmpMask) const override; - bool 
isCompatible(const CpuBlockedMemoryDesc &rhs, CmpMask cmpMask = BlockedMemoryDesc::FULL_MASK) const; - bool isCompatible(const DnnlBlockedMemoryDesc &rhs, CmpMask cmpMask = BlockedMemoryDesc::FULL_MASK) const; + bool isCompatible(const CpuBlockedMemoryDesc& rhs, CmpMask cmpMask = BlockedMemoryDesc::FULL_MASK) const; + bool isCompatible(const DnnlBlockedMemoryDesc& rhs, CmpMask cmpMask = BlockedMemoryDesc::FULL_MASK) const; ov::element::Type getPrecision() const override { return precision; @@ -105,5 +109,5 @@ class CpuBlockedMemoryDesc : public BlockedMemoryDesc { using CpuBlockedMemoryDescPtr = std::shared_ptr; using CpuBlockedMemoryDescCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/memory_desc/cpu_memory_desc.h b/src/plugins/intel_cpu/src/memory_desc/cpu_memory_desc.h index c3936528abed7b..e6d260066118ee 100644 --- a/src/plugins/intel_cpu/src/memory_desc/cpu_memory_desc.h +++ b/src/plugins/intel_cpu/src/memory_desc/cpu_memory_desc.h @@ -22,7 +22,7 @@ namespace ov { namespace intel_cpu { namespace node { class Split; -} // namespace node +} // namespace node class MemoryDesc; @@ -39,10 +39,10 @@ enum MemoryDescType { }; enum class LayoutType : unsigned { - nspc, // general per channels format - ncsp, // general planar - nCsp8c, // general channels blocked by 8 - nCsp16c // general channels blocked by 16 + nspc, // general per channels format + ncsp, // general planar + nCsp8c, // general channels blocked by 8 + nCsp16c // general channels blocked by 16 }; class MemoryDesc { @@ -70,8 +70,8 @@ class MemoryDesc { /** * @brief Clone descriptor with new dims. - * Throws an exception if relaxedCheck is false and some of the new dims conflicts with the internal shape (i.e. 
its defined dims ,rank, upper bounds) - * or if internal shape and dims have different ranks + * Throws an exception if relaxedCheck is false and some of the new dims conflicts with the internal shape (i.e. its + * defined dims ,rank, upper bounds) or if internal shape and dims have different ranks * @param dims new dims * @param relaxedCheck flag which defined must we check dims with internal desc on compatibility * @return MemoryDescPtr with new dims @@ -136,8 +136,8 @@ class MemoryDesc { } template ::value && !std::is_reference::value, int>::type = 0, - typename std::enable_if::value, int>::type = 0> + typename std::enable_if::value && !std::is_reference::value, int>::type = 0, + typename std::enable_if::value, int>::type = 0> T* as() { T* casted = dynamic_cast(this); if (!casted) @@ -146,8 +146,8 @@ class MemoryDesc { } template ::value && !std::is_reference::value, int>::type = 0, - typename std::enable_if::value, int>::type = 0> + typename std::enable_if::value && !std::is_reference::value, int>::type = 0, + typename std::enable_if::value, int>::type = 0> const T* as() const { const T* casted = dynamic_cast(this); if (!casted) @@ -159,17 +159,16 @@ class MemoryDesc { protected: MemoryDesc() : type(MemoryDescType::Undef) {} - MemoryDesc(Shape shape, MemoryDescType type) - : type(type), shape(std::move(shape)) {} + MemoryDesc(Shape shape, MemoryDescType type) : type(type), shape(std::move(shape)) {} - MemoryDesc(const VectorDims& dims, MemoryDescType type) - : type(type), shape(dims) {} + MemoryDesc(const VectorDims& dims, MemoryDescType type) : type(type), shape(dims) {} virtual void setPrecision(ov::element::Type prc) = 0; virtual size_t getCurrentMemSizeImp() const = 0; - // Get offset to the n'th element. Returns physical index of the element by the logical one considering padding, layout, blocking etc. + // Get offset to the n'th element. Returns physical index of the element by the logical one considering padding, + // layout, blocking etc. 
virtual size_t getElementOffset(size_t elemNumber) const = 0; virtual bool canComputeMemSizeZeroDims() const = 0; @@ -195,5 +194,5 @@ class MemoryDesc { friend class node::Split; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/memory_desc/cpu_memory_desc_utils.cpp b/src/plugins/intel_cpu/src/memory_desc/cpu_memory_desc_utils.cpp index 0ae17d6c00322b..2937b73409b67d 100644 --- a/src/plugins/intel_cpu/src/memory_desc/cpu_memory_desc_utils.cpp +++ b/src/plugins/intel_cpu/src/memory_desc/cpu_memory_desc_utils.cpp @@ -4,29 +4,33 @@ #include "memory_desc/cpu_memory_desc_utils.h" -#include "memory_desc/cpu_blocked_memory_desc.h" -#include "memory_desc/dnnl_blocked_memory_desc.h" -#include "graph_context.h" -#include "cpu_memory_desc.h" -#include "memory_desc/empty_memory_desc.h" -#include -#include #include #include + #include #include +#include "cpu_memory_desc.h" +#include "graph_context.h" +#include "memory_desc/cpu_blocked_memory_desc.h" +#include "memory_desc/dnnl_blocked_memory_desc.h" +#include "memory_desc/empty_memory_desc.h" + using namespace dnnl; namespace ov { namespace intel_cpu { -DnnlMemoryDescPtr MemoryDescUtils::convertToDnnlMemoryDesc(const MemoryDescPtr &desc) { +DnnlMemoryDescPtr MemoryDescUtils::convertToDnnlMemoryDesc(const MemoryDescPtr& desc) { if (MemoryDescType::Blocked == desc->getType()) { const auto cpuDesc = desc->as(); - return std::shared_ptr(new DnnlBlockedMemoryDesc(cpuDesc->getPrecision(), cpuDesc->getShape(), cpuDesc->getBlockDims(), - cpuDesc->getOrder(), cpuDesc->getOffsetPadding(), - cpuDesc->getOffsetPaddingToData(), cpuDesc->getStrides())); + return std::shared_ptr(new DnnlBlockedMemoryDesc(cpuDesc->getPrecision(), + cpuDesc->getShape(), + cpuDesc->getBlockDims(), + cpuDesc->getOrder(), + cpuDesc->getOffsetPadding(), + cpuDesc->getOffsetPaddingToData(), + cpuDesc->getStrides())); } else if (MemoryDescType::Empty == desc->getType()) { return 
DnnlExtensionUtils::makeDescriptor(dnnl::memory::desc()); } else if (MemoryDescType::Dnnl & desc->getType()) { @@ -41,14 +45,19 @@ DnnlBlockedMemoryDesc MemoryDescUtils::convertToDnnlBlockedMemoryDesc(const Memo return DnnlBlockedMemoryDesc(*desc.as()); } else if (MemoryDescType::Blocked == desc.getType()) { const auto cpuDesc = desc.as(); - return DnnlBlockedMemoryDesc(cpuDesc->getPrecision(), cpuDesc->getShape(), cpuDesc->getBlockDims(), cpuDesc->getOrder(), cpuDesc->getOffsetPadding(), - cpuDesc->getOffsetPaddingToData(), cpuDesc->getStrides()); + return DnnlBlockedMemoryDesc(cpuDesc->getPrecision(), + cpuDesc->getShape(), + cpuDesc->getBlockDims(), + cpuDesc->getOrder(), + cpuDesc->getOffsetPadding(), + cpuDesc->getOffsetPaddingToData(), + cpuDesc->getStrides()); } else { OPENVINO_THROW("Cannot convert MemoryDesc to DnnlBlockedMemoryDesc"); } } -BlockedMemoryDescPtr MemoryDescUtils::convertToBlockedMemoryDesc(const MemoryDescPtr &desc) { +BlockedMemoryDescPtr MemoryDescUtils::convertToBlockedMemoryDesc(const MemoryDescPtr& desc) { if (desc->getType() & MemoryDescType::Blocked) { return std::dynamic_pointer_cast(desc); } else { @@ -57,7 +66,7 @@ BlockedMemoryDescPtr MemoryDescUtils::convertToBlockedMemoryDesc(const MemoryDes } CpuBlockedMemoryDescPtr MemoryDescUtils::generateCpuBlockedMemoryDesc(const ov::SoPtr& tensor) { - const auto& shape = tensor->get_shape().empty() ? ov::Shape{tensor->get_size()} : tensor->get_shape(); + const auto& shape = tensor->get_shape().empty() ? 
ov::Shape{tensor->get_size()} : tensor->get_shape(); VectorDims blk_order(shape.size()); std::iota(blk_order.begin(), blk_order.end(), 0); @@ -87,17 +96,16 @@ CpuBlockedMemoryDescPtr MemoryDescUtils::generateCpuBlockedMemoryDesc(const ov:: }); } - return std::make_shared( - element_type, - Shape{shape}, - shape, - blk_order, - 0UL, - VectorDims{}, - blk_strides); + return std::make_shared(element_type, + Shape{shape}, + shape, + blk_order, + 0UL, + VectorDims{}, + blk_strides); } -std::shared_ptr MemoryDescUtils::makeDummyDesc(const MemoryDesc &desc, Dim dummyVal) { +std::shared_ptr MemoryDescUtils::makeDummyDesc(const MemoryDesc& desc, Dim dummyVal) { auto dummyShape = makeDummyShape(desc.getShape(), dummyVal); return desc.cloneWithNewDims(dummyShape.getStaticDims()); } @@ -111,7 +119,7 @@ std::shared_ptr MemoryDescUtils::makeEmptyMemory(const GraphContext::CP return std::make_shared(context->getEngine(), makeEmptyDesc(), nullptr); } -Shape MemoryDescUtils::makeDummyShape(const Shape &shape, Dim dummyVal) { +Shape MemoryDescUtils::makeDummyShape(const Shape& shape, Dim dummyVal) { const auto& minDims = shape.getMinDims(); const auto& maxDims = shape.getMaxDims(); const auto& dims = shape.getDims(); @@ -122,7 +130,7 @@ Shape MemoryDescUtils::makeDummyShape(const Shape &shape, Dim dummyVal) { return Shape(dummyDims); } -Shape MemoryDescUtils::makeDummyShape(const Shape &shape, const VectorDims& dummyVals) { +Shape MemoryDescUtils::makeDummyShape(const Shape& shape, const VectorDims& dummyVals) { if (shape.getRank() != dummyVals.size()) { OPENVINO_THROW("makeDummyShape(): dummyVals vector size and shape ranks mismatch"); } @@ -131,9 +139,10 @@ Shape MemoryDescUtils::makeDummyShape(const Shape &shape, const VectorDims& dumm const auto& dims = shape.getDims(); VectorDims dummyDims(dims.size()); for (size_t i = 0; i < dims.size(); ++i) { - dummyDims[i] = dims[i] == Shape::UNDEFINED_DIM ? 
std::min(maxDims[i], std::max(minDims[i], dummyVals[i])) : dims[i]; + dummyDims[i] = + dims[i] == Shape::UNDEFINED_DIM ? std::min(maxDims[i], std::max(minDims[i], dummyVals[i])) : dims[i]; } return Shape(dummyDims); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/memory_desc/cpu_memory_desc_utils.h b/src/plugins/intel_cpu/src/memory_desc/cpu_memory_desc_utils.h index a4acd3eb2aa778..388c9a21c5df8e 100644 --- a/src/plugins/intel_cpu/src/memory_desc/cpu_memory_desc_utils.h +++ b/src/plugins/intel_cpu/src/memory_desc/cpu_memory_desc_utils.h @@ -5,11 +5,12 @@ #pragma once #include + #include "cpu_shape.h" #include "cpu_types.h" +#include "graph_context.h" #include "openvino/runtime/itensor.hpp" #include "openvino/runtime/so_ptr.hpp" -#include "graph_context.h" namespace ov { namespace intel_cpu { @@ -32,7 +33,7 @@ class MemoryDescUtils { * @param desc MemoryDesc to be converted * @return converted DnnlMemoryDesc */ - static std::shared_ptr convertToDnnlMemoryDesc(const std::shared_ptr &desc); + static std::shared_ptr convertToDnnlMemoryDesc(const std::shared_ptr& desc); /** * @brief Converts MemoryDesc to DnnlBlockedMemoryDesc @@ -46,7 +47,7 @@ class MemoryDescUtils { * @param desc MemoryDesc to be converted * @return converted BlockedMemoryDesc */ - static std::shared_ptr convertToBlockedMemoryDesc(const std::shared_ptr &desc); + static std::shared_ptr convertToBlockedMemoryDesc(const std::shared_ptr& desc); /** * @brief Builds CpuBlockedMemoryDesc for given ov::ITensor @@ -58,7 +59,8 @@ class MemoryDescUtils { static constexpr Dim DEFAULT_DUMMY_VAL = 64; /** - * @brief Makes a dummy descriptor where all undefined values are replaced with the smallest value between the parameter and the upper bound dim + * @brief Makes a dummy descriptor where all undefined values are replaced with the smallest value between the + * parameter and the upper bound dim * @param desc MemoryDesc from which 
the new descriptor is generated * @param dummyVal Dim value to replace undefined dimensions * @return a new MemoryDesc with dummy values instead of undefined dims @@ -66,27 +68,29 @@ class MemoryDescUtils { static std::shared_ptr makeDummyDesc(const MemoryDesc& desc, Dim dummyVal = DEFAULT_DUMMY_VAL); /** - * @brief Make an empty memory descriptor - * @note Shape{0}, undefined - * @return empty memory descriptor - */ + * @brief Make an empty memory descriptor + * @note Shape{0}, undefined + * @return empty memory descriptor + */ static std::shared_ptr makeEmptyDesc(); static std::shared_ptr makeEmptyMemory(const GraphContext::CPtr context); /** - * @brief Makes a static dummy shape where all undefined values are replaced with the smallest value between the parameter and the upper bound dim - * @param shape a Shape object from which the new static shape is generated - * @param dummyVal Dim value to replace undefined dimensions - * @return a new Shape with dummy values instead of undefined dims - */ + * @brief Makes a static dummy shape where all undefined values are replaced with the smallest value between the + * parameter and the upper bound dim + * @param shape a Shape object from which the new static shape is generated + * @param dummyVal Dim value to replace undefined dimensions + * @return a new Shape with dummy values instead of undefined dims + */ static Shape makeDummyShape(const Shape& shape, Dim dummyVal = DEFAULT_DUMMY_VAL); /** - * @brief Makes a static dummy shape where all undefined values are replaced with the smallest value between the parameter and the upper bound dim - * @param shape a Shape object from which the new static shape is generated - * @param dummyVals vector of values to replace undefined dimensions - * @return a new Shape with dummy values instead of undefined dims - */ + * @brief Makes a static dummy shape where all undefined values are replaced with the smallest value between the + * parameter and the upper bound dim + * @param 
shape a Shape object from which the new static shape is generated + * @param dummyVals vector of values to replace undefined dimensions + * @return a new Shape with dummy values instead of undefined dims + */ static Shape makeDummyShape(const Shape& shape, const VectorDims& dummyVals); /** @@ -104,5 +108,5 @@ class MemoryDescUtils { static std::string dims2str(const VectorDims& dims); }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/memory_desc/dnnl_blocked_memory_desc.cpp b/src/plugins/intel_cpu/src/memory_desc/dnnl_blocked_memory_desc.cpp index a24b55831c2c7c..38c020674c7168 100644 --- a/src/plugins/intel_cpu/src/memory_desc/dnnl_blocked_memory_desc.cpp +++ b/src/plugins/intel_cpu/src/memory_desc/dnnl_blocked_memory_desc.cpp @@ -4,26 +4,28 @@ #include "memory_desc/dnnl_blocked_memory_desc.h" +#include #include +#include #include + #include "cpu_types.h" #include "dnnl_extension_utils.h" #include "memory_desc/cpu_blocked_memory_desc.h" #include "utils/general_utils.h" -#include -#include - namespace ov { namespace intel_cpu { DnnlBlockedMemoryDesc::DnnlBlockedMemoryDesc(ov::element::Type prc, const Shape& shape, const VectorDims& strides) : MemoryDesc(shape, DnnlBlocked) { const auto ndims = shape.getRank(); - const auto &dims = shape.getDims(); + const auto& dims = shape.getDims(); - if (!strides.empty()) { // custom strides - if (shape.hasZeroDims() && std::any_of(strides.begin(), strides.end(), [](size_t stride) { return stride != 0; } )) { + if (!strides.empty()) { // custom strides + if (shape.hasZeroDims() && std::any_of(strides.begin(), strides.end(), [](size_t stride) { + return stride != 0; + })) { OPENVINO_THROW("Can't create DnnlBlockedMemoryDesc with zero dim, but with non zero strides"); } desc = {DnnlExtensionUtils::convertToDnnlDims(dims), @@ -33,16 +35,20 @@ DnnlBlockedMemoryDesc::DnnlBlockedMemoryDesc(ov::element::Type prc, const Shape& dnnl::memory::dims 
plain_strides; if (shape.hasZeroDims()) { plain_strides.resize(ndims, 0); - } else if (std::any_of(dims.begin(), dims.end(), [](size_t val) { return val == Shape::UNDEFINED_DIM; })) { + } else if (std::any_of(dims.begin(), dims.end(), [](size_t val) { + return val == Shape::UNDEFINED_DIM; + })) { plain_strides.resize(ndims, DNNL_RUNTIME_DIM_VAL); } else { plain_strides.resize(ndims, 1); for (size_t i = 1; i < ndims; i++) { - plain_strides[ndims - i -1] = plain_strides[ndims - i] * dims[ndims - i]; + plain_strides[ndims - i - 1] = plain_strides[ndims - i] * dims[ndims - i]; } } - desc = {DnnlExtensionUtils::convertToDnnlDims(dims), DnnlExtensionUtils::ElementTypeToDataType(prc), plain_strides}; + desc = {DnnlExtensionUtils::convertToDnnlDims(dims), + DnnlExtensionUtils::ElementTypeToDataType(prc), + plain_strides}; } order.resize(ndims); @@ -55,11 +61,12 @@ DnnlBlockedMemoryDesc::DnnlBlockedMemoryDesc(ov::element::Type prc, const Shape& * Construct from blocked parameters * * OV IOhw_4i16o4i dims(N) = {32, 64, 128, 128} - * blockedDims {4, 2, 128, 128, 4, 16, 4} // total dims(inner, outermost, auto blocked/padded). Generally sorted by strides. - * strides {8388608, 4194304, 32768, 256, 64, 4, 1} // strides for blockedDims, growing sequence - * order {1, 0, 2, 3, 1, 0, 1} // matching to original dims + * blockedDims {4, 2, 128, 128, 4, 16, 4} // total dims(inner, outermost, auto blocked/padded). + * Generally sorted by strides. strides {8388608, 4194304, 32768, 256, 64, 4, 1} // strides for blockedDims, + * growing sequence order {1, 0, 2, 3, 1, 0, 1} // matching to original dims * - * All vectors blockedDims/strides/order have same size equals total num of internal blocked dims(inner_dims + outer_dims) + * All vectors blockedDims/strides/order have same size equals total num of internal blocked dims(inner_dims + + * outer_dims) * * Tensor descriptor filing is not deterministic. It allows any permutation of index which keeps order of * real dims spliting. 
@@ -70,9 +77,14 @@ DnnlBlockedMemoryDesc::DnnlBlockedMemoryDesc(ov::element::Type prc, const Shape& * * Limitation of conversion first N elements of order should be permutation of [0,1,2 ... N] */ -DnnlBlockedMemoryDesc::DnnlBlockedMemoryDesc(ov::element::Type prc, const Shape& shape, const VectorDims& blockedDims, - const VectorDims& order, size_t offsetPadding, const VectorDims& offsetPaddingToData, - const VectorDims& strides) : MemoryDesc(shape, DnnlBlocked) { +DnnlBlockedMemoryDesc::DnnlBlockedMemoryDesc(ov::element::Type prc, + const Shape& shape, + const VectorDims& blockedDims, + const VectorDims& order, + size_t offsetPadding, + const VectorDims& offsetPaddingToData, + const VectorDims& strides) + : MemoryDesc(shape, DnnlBlocked) { using namespace dnnl; // scalar case if (shape.getRank() == 0) { @@ -128,7 +140,9 @@ DnnlBlockedMemoryDesc::DnnlBlockedMemoryDesc(ov::element::Type prc, const Shape& const bool emptyDesc = shape.hasZeroDims(); if (!strides.empty()) { - if (emptyDesc && std::any_of(strides.begin(), strides.end(), [](size_t dim) { return dim != 0; } )) { + if (emptyDesc && std::any_of(strides.begin(), strides.end(), [](size_t dim) { + return dim != 0; + })) { OPENVINO_THROW("Can't create DnnlBlockedMemoryDesc with zero dim, but with non zero strides"); } @@ -143,7 +157,9 @@ DnnlBlockedMemoryDesc::DnnlBlockedMemoryDesc(ov::element::Type prc, const Shape& OPENVINO_THROW("Can not construct DnnlBlockedMemoryDesc from strides: ", vec2str(strides)); } - if (!strides.empty() && !emptyDesc && std::none_of(strides.begin(), strides.end(), [](size_t x) { return Shape::UNDEFINED_DIM == x; })) { + if (!strides.empty() && !emptyDesc && std::none_of(strides.begin(), strides.end(), [](size_t x) { + return Shape::UNDEFINED_DIM == x; + })) { bool inner_block_are_dense = one_of(strides.back(), 0u, 1u); // stride 1 - is dense case, 0 - broad casted for (size_t i = outer_ndims; i < strides.size() - 1; i++) { inner_block_are_dense &= (strides[i] == strides[i + 1] * 
blockedDims[i + 1]); @@ -164,8 +180,10 @@ DnnlBlockedMemoryDesc::DnnlBlockedMemoryDesc(ov::element::Type prc, const Shape& std::copy(dims.begin(), dims.end(), desc.get()->dims); if (!offsetPaddingToData.empty()) { - bool inner_pad_offsets_is_zero = std::all_of(offsetPaddingToData.begin() + outer_ndims, offsetPaddingToData.end(), - [](size_t pad) { return pad == 0; }); + bool inner_pad_offsets_is_zero = + std::all_of(offsetPaddingToData.begin() + outer_ndims, offsetPaddingToData.end(), [](size_t pad) { + return pad == 0; + }); if (!inner_pad_offsets_is_zero) OPENVINO_THROW("Can not construct DnnlBlockedMemoryDesc, inner pad offsets is not zero: ", @@ -189,7 +207,7 @@ DnnlBlockedMemoryDesc::DnnlBlockedMemoryDesc(ov::element::Type prc, const Shape& } // Fill blocking desc - auto &dnn_blk_desc = desc.get()->format_desc.blocking; + auto& dnn_blk_desc = desc.get()->format_desc.blocking; dnn_blk_desc.inner_nblks = inner_ndims; std::copy(dnnlBlkDims.end() - inner_ndims, dnnlBlkDims.end(), dnn_blk_desc.inner_blks); std::copy(order.end() - inner_ndims, order.end(), dnn_blk_desc.inner_idxs); @@ -209,8 +227,10 @@ DnnlBlockedMemoryDesc::DnnlBlockedMemoryDesc(ov::element::Type prc, const Shape& } } -DnnlBlockedMemoryDesc::DnnlBlockedMemoryDesc(const Shape& shape, dnnl::memory::data_type dataType, dnnl::memory::format_tag format) : - MemoryDesc(shape, DnnlBlocked) { +DnnlBlockedMemoryDesc::DnnlBlockedMemoryDesc(const Shape& shape, + dnnl::memory::data_type dataType, + dnnl::memory::format_tag format) + : MemoryDesc(shape, DnnlBlocked) { using namespace dnnl; if (format == memory::format_tag::any || format == memory::format_tag::undef) OPENVINO_THROW("Unexpected: Can't create dnnl::desc with any or undef format"); @@ -249,7 +269,7 @@ bool DnnlBlockedMemoryDesc::isCompatible(const MemoryDesc& rhs) const { } } -bool DnnlBlockedMemoryDesc::isCompatible(const BlockedMemoryDesc &rhs, CmpMask cmpMask) const { +bool DnnlBlockedMemoryDesc::isCompatible(const BlockedMemoryDesc& rhs, 
CmpMask cmpMask) const { if (auto desc = dynamic_cast(&rhs)) { return isCompatible(*desc, cmpMask); } else if (auto desc = dynamic_cast(&rhs)) { @@ -261,7 +281,8 @@ bool DnnlBlockedMemoryDesc::isCompatible(const BlockedMemoryDesc &rhs, CmpMask c bool DnnlBlockedMemoryDesc::isCompatible(const CpuBlockedMemoryDesc& rhs, CmpMask cmpMask) const { dnnl::impl::memory_desc_wrapper wrapped(desc.get()); - return wrapped.extra().flags == dnnl_memory_extra_flag_none && BlockedMemoryDesc::isCompatibleInternal(rhs, cmpMask); + return wrapped.extra().flags == dnnl_memory_extra_flag_none && + BlockedMemoryDesc::isCompatibleInternal(rhs, cmpMask); } bool DnnlBlockedMemoryDesc::isCompatible(const DnnlBlockedMemoryDesc& rhs, CmpMask cmpMask) const { @@ -288,8 +309,10 @@ bool DnnlBlockedMemoryDesc::isCompatible(const DnnlBlockedMemoryDesc& rhs, CmpMa const auto thisExtra = wrappedThis.extra(); const auto rhsExtra = wrappedRhs.extra(); - return this->getOrder() == rhs.getOrder() && (thisExtra.flags == rhsExtra.flags && thisExtra.compensation_mask == rhsExtra.compensation_mask && - thisExtra.scale_adjust == rhsExtra.scale_adjust) && wrappedThis.similar_to(wrappedRhs, true, true, 0, true, checkOffset, stride_mask); + return this->getOrder() == rhs.getOrder() && + (thisExtra.flags == rhsExtra.flags && thisExtra.compensation_mask == rhsExtra.compensation_mask && + thisExtra.scale_adjust == rhsExtra.scale_adjust) && + wrappedThis.similar_to(wrappedRhs, true, true, 0, true, checkOffset, stride_mask); } static VectorDims extractOrder(const dnnl::memory::desc& desc) { @@ -300,7 +323,7 @@ static VectorDims extractOrder(const dnnl::memory::desc& desc) { OPENVINO_THROW("Unexpected: Cannot calculate order from undefined dims or strides"); } - const auto &blk_desc = descWrapped.blocking_desc(); + const auto& blk_desc = descWrapped.blocking_desc(); const size_t outer_ndims = dims.size(); const size_t inner_ndims = blk_desc.inner_nblks; @@ -319,11 +342,11 @@ static VectorDims extractOrder(const 
dnnl::memory::desc& desc) { // order of outer dims. In case of IOhw_ will be {1, 0, 2, 3} VectorDims outer_order(outer_ndims); std::iota(outer_order.begin(), outer_order.end(), 0); - std::sort(outer_order.begin(), outer_order.end(), - [&blk_desc, &outer_block_dims](size_t ind_l, size_t ind_r) { - return (blk_desc.strides[ind_l] > blk_desc.strides[ind_r]) || - (blk_desc.strides[ind_l] == blk_desc.strides[ind_r] && outer_block_dims[ind_l] > outer_block_dims[ind_r]); - }); + std::sort(outer_order.begin(), outer_order.end(), [&blk_desc, &outer_block_dims](size_t ind_l, size_t ind_r) { + return (blk_desc.strides[ind_l] > blk_desc.strides[ind_r]) || + (blk_desc.strides[ind_l] == blk_desc.strides[ind_r] && + outer_block_dims[ind_l] > outer_block_dims[ind_r]); + }); // blocked order // [new_outer_order] U [inner_idxs] @@ -333,8 +356,8 @@ static VectorDims extractOrder(const dnnl::memory::desc& desc) { return blk_order; } -DnnlBlockedMemoryDesc::DnnlBlockedMemoryDesc(const_dnnl_memory_desc_t cdesc) : - MemoryDesc(DnnlExtensionUtils::convertToVectorDims(cdesc->dims, cdesc->ndims), DnnlBlocked) { +DnnlBlockedMemoryDesc::DnnlBlockedMemoryDesc(const_dnnl_memory_desc_t cdesc) + : MemoryDesc(DnnlExtensionUtils::convertToVectorDims(cdesc->dims, cdesc->ndims), DnnlBlocked) { desc = dnnl::memory::desc(DnnlExtensionUtils::clone_desc(cdesc)); if (desc.get_format_kind() == dnnl::memory::format_kind::any) @@ -356,16 +379,16 @@ DnnlBlockedMemoryDesc::DnnlBlockedMemoryDesc(const_dnnl_memory_desc_t cdesc) : bool DnnlBlockedMemoryDesc::hasLayoutType(LayoutType layoutType) const { switch (layoutType) { - case LayoutType::ncsp: - return isPlainFormat(); - case LayoutType::nspc: - return isTailCFormat(); - case LayoutType::nCsp8c: - return isBlockedCFormat(8); - case LayoutType::nCsp16c: - return isBlockedCFormat(16); - default: - return false; + case LayoutType::ncsp: + return isPlainFormat(); + case LayoutType::nspc: + return isTailCFormat(); + case LayoutType::nCsp8c: + return 
isBlockedCFormat(8); + case LayoutType::nCsp16c: + return isBlockedCFormat(16); + default: + return false; } } @@ -382,8 +405,7 @@ bool DnnlBlockedMemoryDesc::isPlainFormat() const { } bool DnnlBlockedMemoryDesc::isBlockedCFormat(size_t blk_size) const { - if (desc.get_format_kind() != dnnl::memory::format_kind::blocked || - desc.get_inner_nblks() != 1 || + if (desc.get_format_kind() != dnnl::memory::format_kind::blocked || desc.get_inner_nblks() != 1 || desc.get_inner_idxs()[0] != 1) return false; @@ -452,13 +474,15 @@ static dnnl::memory::desc cloneDescWithNewDims(const dnnl::memory::desc& desc, return newMklDesc; } -MemoryDescPtr DnnlBlockedMemoryDesc::cloneWithNewDimsImp(const VectorDims &dims) const { - if (std::any_of(dims.begin(), dims.end(), [](size_t x){ return Shape::UNDEFINED_DIM == x; })) { +MemoryDescPtr DnnlBlockedMemoryDesc::cloneWithNewDimsImp(const VectorDims& dims) const { + if (std::any_of(dims.begin(), dims.end(), [](size_t x) { + return Shape::UNDEFINED_DIM == x; + })) { OPENVINO_THROW("Can't clone desc if new dims are undefined"); } // TODO [DS]: add stride recalculation for strided blobs - for (int i = strides.size() - 2; i >= 0 ; i--) { + for (int i = strides.size() - 2; i >= 0; i--) { if (strides[i] == Shape::UNDEFINED_DIM) break; @@ -499,7 +523,7 @@ bool DnnlBlockedMemoryDesc::isSame(dnnl::memory::format_tag fmt) const { { const auto dims = desc.get_dims(); VectorDims total_block_per_dim(dims.size(), 1); - const auto &blk_desc = desc.get()->format_desc.blocking; + const auto& blk_desc = desc.get()->format_desc.blocking; for (int i = 0; i < blk_desc.inner_nblks; i++) { total_block_per_dim[blk_desc.inner_idxs[i]] *= blk_desc.inner_blks[i]; } @@ -509,10 +533,12 @@ bool DnnlBlockedMemoryDesc::isSame(dnnl::memory::format_tag fmt) const { } std::iota(actualOrder.begin(), actualOrder.end(), 0); - std::sort(actualOrder.begin(), actualOrder.end(), - [&actualStrides, &outer_block_dims] (size_t ind_l, size_t ind_r) { + std::sort(actualOrder.begin(), 
+ actualOrder.end(), + [&actualStrides, &outer_block_dims](size_t ind_l, size_t ind_r) { return (actualStrides[ind_l] > actualStrides[ind_r]) || - (actualStrides[ind_l] == actualStrides[ind_r] && outer_block_dims[ind_l] > outer_block_dims[ind_r]); + (actualStrides[ind_l] == actualStrides[ind_r] && + outer_block_dims[ind_l] > outer_block_dims[ind_r]); }); } @@ -520,7 +546,7 @@ bool DnnlBlockedMemoryDesc::isSame(dnnl::memory::format_tag fmt) const { { const auto dims = refDesc.get_dims(); VectorDims total_block_per_dim(dims.size(), 1); - const auto &blk_desc = refDesc.get()->format_desc.blocking; + const auto& blk_desc = refDesc.get()->format_desc.blocking; for (int i = 0; i < blk_desc.inner_nblks; i++) { total_block_per_dim[blk_desc.inner_idxs[i]] *= blk_desc.inner_blks[i]; } @@ -530,11 +556,10 @@ bool DnnlBlockedMemoryDesc::isSame(dnnl::memory::format_tag fmt) const { } std::iota(refOrder.begin(), refOrder.end(), 0); - std::sort(refOrder.begin(), refOrder.end(), - [&refStrides, &outer_block_dims] (size_t ind_l, size_t ind_r) { - return (refStrides[ind_l] > refStrides[ind_r]) || - (refStrides[ind_l] == refStrides[ind_r] && outer_block_dims[ind_l] > outer_block_dims[ind_r]); - }); + std::sort(refOrder.begin(), refOrder.end(), [&refStrides, &outer_block_dims](size_t ind_l, size_t ind_r) { + return (refStrides[ind_l] > refStrides[ind_r]) || + (refStrides[ind_l] == refStrides[ind_r] && outer_block_dims[ind_l] > outer_block_dims[ind_r]); + }); } if (actualOrder != refOrder) { @@ -549,7 +574,9 @@ size_t DnnlBlockedMemoryDesc::getMaxMemSize() const { } const auto& maxDims = shape.getMaxDims(); - if (std::any_of(maxDims.begin(), maxDims.end(), [](size_t x){ return Shape::UNDEFINED_DIM == x; })) { + if (std::any_of(maxDims.begin(), maxDims.end(), [](size_t x) { + return Shape::UNDEFINED_DIM == x; + })) { return UNDEFINED_SIZE; } @@ -563,11 +590,13 @@ size_t DnnlBlockedMemoryDesc::getPaddedElementsCount() const { } auto padded_dims = desc.get_padded_dims(); - if 
(std::any_of(std::begin(padded_dims), std::begin(padded_dims) + desc.get_ndims(), - [](dnnl_dim_t dim) { return dim == DNNL_RUNTIME_DIM_VAL; })) { + if (std::any_of(std::begin(padded_dims), std::begin(padded_dims) + desc.get_ndims(), [](dnnl_dim_t dim) { + return dim == DNNL_RUNTIME_DIM_VAL; + })) { OPENVINO_THROW("Can't compute padded elements count for non undefined blocked dims"); } - return std::accumulate(std::begin(padded_dims), std::begin(padded_dims) + desc.get_ndims(), + return std::accumulate(std::begin(padded_dims), + std::begin(padded_dims) + desc.get_ndims(), size_t{1}, std::multiplies()); } @@ -586,7 +615,7 @@ void DnnlBlockedMemoryDesc::initBlockDims() { const auto dims = desc.get_dims(); const size_t outer_ndims = dims.size(); - const auto inner_ndims = desc.get_inner_nblks(); + const auto inner_ndims = desc.get_inner_nblks(); const size_t total_ndims = outer_ndims + inner_ndims; // total inner block size. in case of 4i16o4i will be {16, 16, 1, 1} @@ -612,10 +641,10 @@ void DnnlBlockedMemoryDesc::initBlockDims() { std::copy(order.begin(), order.begin() + outer_ndims, outer_order.begin()); blockedDims.resize(total_ndims, 0); - std::copy(inner_blks.begin(), inner_blks.begin() + inner_nblks, - blockedDims.end() - inner_nblks); - std::transform(outer_order.begin(), outer_order.end(), blockedDims.begin(), - [&] (size_t i) { return outer_block_dims[i]; }); + std::copy(inner_blks.begin(), inner_blks.begin() + inner_nblks, blockedDims.end() - inner_nblks); + std::transform(outer_order.begin(), outer_order.end(), blockedDims.begin(), [&](size_t i) { + return outer_block_dims[i]; + }); } void DnnlBlockedMemoryDesc::initStrides() { @@ -623,7 +652,7 @@ void DnnlBlockedMemoryDesc::initStrides() { const size_t outer_ndims = dims.size(); const size_t inner_nblks = desc.get_inner_nblks(); - const auto inner_blks = desc.get_inner_blks(); + const auto inner_blks = desc.get_inner_blks(); const size_t total_ndims = outer_ndims + inner_nblks; // strides of inner dims. 
In case of 4i16o4i will be {64, 4, 1} @@ -642,8 +671,9 @@ void DnnlBlockedMemoryDesc::initStrides() { std::copy(inner_strides.rbegin(), inner_strides.rend(), strides.rbegin()); const auto desc_strides = desc.get_strides(); - std::transform(outer_order.begin(), outer_order.end(), strides.begin(), - [&](size_t i) { return desc_strides[i] == DNNL_RUNTIME_DIM_VAL ? Shape::UNDEFINED_DIM : desc_strides[i]; }); + std::transform(outer_order.begin(), outer_order.end(), strides.begin(), [&](size_t i) { + return desc_strides[i] == DNNL_RUNTIME_DIM_VAL ? Shape::UNDEFINED_DIM : desc_strides[i]; + }); } void DnnlBlockedMemoryDesc::initOffsetPadding() { @@ -659,15 +689,17 @@ MemoryDescPtr DnnlBlockedMemoryDesc::cloneWithNewPrecision(const ov::element::Ty } void DnnlBlockedMemoryDesc::recomputeDefaultStrides() { - const auto &rank = getShape().getRank(); + const auto& rank = getShape().getRank(); if (order.size() != blockedDims.size()) OPENVINO_THROW("Can't recompute stride: order size != blocked dims size"); - auto &oneDnnStrides = desc.get()->format_desc.blocking.strides; + auto& oneDnnStrides = desc.get()->format_desc.blocking.strides; if (getShape().hasZeroDims()) { std::fill(std::begin(oneDnnStrides), std::begin(oneDnnStrides) + getShape().getRank(), 0); - } else if (std::any_of(blockedDims.begin(), blockedDims.end(), [](Dim val) { return val == Shape::UNDEFINED_DIM; })) { + } else if (std::any_of(blockedDims.begin(), blockedDims.end(), [](Dim val) { + return val == Shape::UNDEFINED_DIM; + })) { std::fill(std::begin(oneDnnStrides), std::begin(oneDnnStrides) + rank, DNNL_RUNTIME_DIM_VAL); initStrides(); } else { @@ -682,8 +714,8 @@ void DnnlBlockedMemoryDesc::recomputeDefaultStrides() { } } -DnnlBlockedMemoryDesc::DnnlBlockedMemoryDesc(const dnnl::memory::desc& mdesc, const Shape& shape) : - MemoryDesc(shape, DnnlBlocked) { +DnnlBlockedMemoryDesc::DnnlBlockedMemoryDesc(const dnnl::memory::desc& mdesc, const Shape& shape) + : MemoryDesc(shape, DnnlBlocked) { if 
(mdesc.get_format_kind() == dnnl::memory::format_kind::any) OPENVINO_THROW("Unexpected: Memory format any is prohibited!"); @@ -715,5 +747,5 @@ std::string DnnlBlockedMemoryDesc::serializeFormat() const { return BlockedMemoryDesc::serializeFormat(); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/memory_desc/dnnl_blocked_memory_desc.h b/src/plugins/intel_cpu/src/memory_desc/dnnl_blocked_memory_desc.h index a6c6a3297ba044..91388c12e2abf7 100644 --- a/src/plugins/intel_cpu/src/memory_desc/dnnl_blocked_memory_desc.h +++ b/src/plugins/intel_cpu/src/memory_desc/dnnl_blocked_memory_desc.h @@ -4,19 +4,20 @@ #pragma once +#include + +#include "dnnl_extension_utils.h" #include "dnnl_memory_desc.h" #include "memory_desc/blocked_memory_desc.h" #include "openvino/util/util.hpp" -#include "dnnl_extension_utils.h" -#include namespace ov { namespace intel_cpu { class CpuBlockedMemoryDesc; -OPENVINO_DISABLE_WARNING_MSVC_BEGIN(4250) // Visual Studio warns us about inheritance via dominance but it's done intentionally - // so turn it off +OPENVINO_DISABLE_WARNING_MSVC_BEGIN(4250) // Visual Studio warns us about inheritance via dominance but it's done + // intentionally so turn it off class DnnlBlockedMemoryDesc : public BlockedMemoryDesc, public DnnlMemoryDesc { public: // Creates planar DnnlBlockedMemoryDesc @@ -30,8 +31,8 @@ class DnnlBlockedMemoryDesc : public BlockedMemoryDesc, public DnnlMemoryDesc { bool isCompatible(const MemoryDesc& rhs) const override; bool isCompatible(const BlockedMemoryDesc& rhs, CmpMask cmpMask) const override; - bool isCompatible(const CpuBlockedMemoryDesc &rhs, CmpMask cmpMask = FULL_MASK) const; - bool isCompatible(const DnnlBlockedMemoryDesc &rhs, CmpMask cmpMask = FULL_MASK) const; + bool isCompatible(const CpuBlockedMemoryDesc& rhs, CmpMask cmpMask = FULL_MASK) const; + bool isCompatible(const DnnlBlockedMemoryDesc& rhs, CmpMask cmpMask = FULL_MASK) const; const 
VectorDims& getBlockDims() const override { return blockedDims; @@ -63,17 +64,22 @@ class DnnlBlockedMemoryDesc : public BlockedMemoryDesc, public DnnlMemoryDesc { MemoryDescPtr cloneWithNewPrecision(const ov::element::Type prec) const override; - using DnnlMemoryDesc::setPrecision; using DnnlMemoryDesc::getPrecision; + using DnnlMemoryDesc::setPrecision; private: - DnnlBlockedMemoryDesc(ov::element::Type prc, const Shape& shape, const VectorDims& blockedDims, - const VectorDims& order, size_t offsetPadding = 0, const VectorDims& offsetPaddingToData = {}, + DnnlBlockedMemoryDesc(ov::element::Type prc, + const Shape& shape, + const VectorDims& blockedDims, + const VectorDims& order, + size_t offsetPadding = 0, + const VectorDims& offsetPaddingToData = {}, const VectorDims& strides = {}); - // Creates DnnlBlockedMemoryDesc using the shape parameter as a true shape but all other params (layout, blocks, etc.) are used from the mdesc, but - // the mdesc own shape is ignored. The main purpose of this constructor is making dynamic descriptor from some dummy mdesc, which stores info about - // layout, blocking, strides, etc., and the provided dynamic shape. + // Creates DnnlBlockedMemoryDesc using the shape parameter as a true shape but all other params (layout, blocks, + // etc.) are used from the mdesc, but the mdesc own shape is ignored. The main purpose of this constructor is making + // dynamic descriptor from some dummy mdesc, which stores info about layout, blocking, strides, etc., and the + // provided dynamic shape. 
DnnlBlockedMemoryDesc(const dnnl::memory::desc& mdesc, const Shape& shape); explicit DnnlBlockedMemoryDesc(const_dnnl_memory_desc_t cdesc); @@ -84,7 +90,8 @@ class DnnlBlockedMemoryDesc : public BlockedMemoryDesc, public DnnlMemoryDesc { bool isBlockedCFormat(size_t blk_size = UNREACHABLE_DIM) const; bool isTailCFormat() const; - // WA: we need to initialize blocked params into ctor to avoid bugs when we calculate these params in throughput mode + // WA: we need to initialize blocked params into ctor to avoid bugs when we calculate these params in throughput + // mode // TODO [DS]: should be reimplemented to avoid useless calculation void initBlockedParams() { initBlockDims(); @@ -99,7 +106,8 @@ class DnnlBlockedMemoryDesc : public BlockedMemoryDesc, public DnnlMemoryDesc { void recomputeDefaultStrides(); friend DnnlMemoryDescPtr DnnlExtensionUtils::makeDescriptor(const_dnnl_memory_desc_t desc); - friend std::shared_ptr DnnlExtensionUtils::makeUndefinedDesc(const dnnl::memory::desc &desc, const Shape& shape); + friend std::shared_ptr DnnlExtensionUtils::makeUndefinedDesc(const dnnl::memory::desc& desc, + const Shape& shape); friend class MemoryDescUtils; }; OPENVINO_DISABLE_WARNING_MSVC_END(4250) @@ -107,5 +115,5 @@ OPENVINO_DISABLE_WARNING_MSVC_END(4250) using DnnlBlockedMemoryDescPtr = std::shared_ptr; using DnnlBlockedMemoryDescCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/memory_desc/dnnl_memory_desc.cpp b/src/plugins/intel_cpu/src/memory_desc/dnnl_memory_desc.cpp index 3e3af41cfc523a..375b218272ed57 100644 --- a/src/plugins/intel_cpu/src/memory_desc/dnnl_memory_desc.cpp +++ b/src/plugins/intel_cpu/src/memory_desc/dnnl_memory_desc.cpp @@ -3,20 +3,21 @@ // #include "dnnl_memory_desc.h" -#include "dnnl_extension_utils.h" + #include #include + +#include "dnnl_extension_utils.h" #include "onednn/dnnl.h" namespace ov { namespace intel_cpu { 
-DnnlMemoryDesc::DnnlMemoryDesc(const dnnl::memory::desc& desc) : - DnnlMemoryDesc(desc.get()) {} +DnnlMemoryDesc::DnnlMemoryDesc(const dnnl::memory::desc& desc) : DnnlMemoryDesc(desc.get()) {} -DnnlMemoryDesc::DnnlMemoryDesc(const_dnnl_memory_desc_t cdesc) : - MemoryDesc(Shape(DnnlExtensionUtils::convertToVectorDims(cdesc->dims, cdesc->ndims)), Dnnl), - desc(DnnlExtensionUtils::clone_desc(cdesc)) { +DnnlMemoryDesc::DnnlMemoryDesc(const_dnnl_memory_desc_t cdesc) + : MemoryDesc(Shape(DnnlExtensionUtils::convertToVectorDims(cdesc->dims, cdesc->ndims)), Dnnl), + desc(DnnlExtensionUtils::clone_desc(cdesc)) { if (getFormatKind() == dnnl::memory::format_kind::any) OPENVINO_THROW("Unexpected: Memory format any is prohibited!"); } @@ -35,7 +36,7 @@ MemoryDescPtr DnnlMemoryDesc::cloneWithNewPrecision(const ov::element::Type prec return newDesc; } -bool DnnlMemoryDesc::isCompatible(const MemoryDesc &rhs) const { +bool DnnlMemoryDesc::isCompatible(const MemoryDesc& rhs) const { if (MemoryDescType::Dnnl & rhs.getType()) { auto* dnnMemDesc = rhs.as(); return isCompatible(*dnnMemDesc); @@ -52,17 +53,25 @@ std::string DnnlMemoryDesc::serializeFormat() const { dnnl::impl::memory_desc_wrapper wrapped(desc.get()); if (wrapped.is_wino_desc()) { switch (desc.get()->format_desc.wino_desc.wino_format) { - case dnnl::impl::wino_memory_format_t::wino_wei_aaOio: return "wino_aaOio"; - case dnnl::impl::wino_memory_format_t::wino_wei_aaOBiOo: return "wino_aaOBiOo"; - case dnnl::impl::wino_memory_format_t::wino_wei_OBaaIBOIio: return "wino_OBaaIBOIio"; - default: return "wino_undef"; + case dnnl::impl::wino_memory_format_t::wino_wei_aaOio: + return "wino_aaOio"; + case dnnl::impl::wino_memory_format_t::wino_wei_aaOBiOo: + return "wino_aaOBiOo"; + case dnnl::impl::wino_memory_format_t::wino_wei_OBaaIBOIio: + return "wino_OBaaIBOIio"; + default: + return "wino_undef"; } } else if (wrapped.is_rnn_packed_desc()) { switch (desc.get()->format_desc.rnn_packed_desc.format) { - case 
dnnl::impl::rnn_packed_format::ldigo_p: return "packed_ldigo"; - case dnnl::impl::rnn_packed_format::ldgoi_p: return "packed_ldgoi"; - case dnnl::impl::rnn_packed_format::ldio_p: return "packed_ldio"; - default: return "packed_undef"; + case dnnl::impl::rnn_packed_format::ldigo_p: + return "packed_ldigo"; + case dnnl::impl::rnn_packed_format::ldgoi_p: + return "packed_ldgoi"; + case dnnl::impl::rnn_packed_format::ldio_p: + return "packed_ldio"; + default: + return "packed_undef"; } } return "undef"; @@ -116,7 +125,7 @@ bool DnnlMemoryDesc::isDefinedImp() const { return wrappedThis.offset0() != DNNL_RUNTIME_DIM_VAL; } -MemoryDescPtr DnnlMemoryDesc::cloneWithNewDimsImp(const VectorDims &dims) const { +MemoryDescPtr DnnlMemoryDesc::cloneWithNewDimsImp(const VectorDims& dims) const { OPENVINO_THROW("Unexpected: Cannot clone non blocked oneDNN desc with new dims"); } @@ -125,6 +134,5 @@ size_t DnnlMemoryDesc::getOffsetPadding() const { return DnnlExtensionUtils::convertToDim(wrap.offset0()); } - -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/memory_desc/dnnl_memory_desc.h b/src/plugins/intel_cpu/src/memory_desc/dnnl_memory_desc.h index f2722a5170f871..6b3692c5663078 100644 --- a/src/plugins/intel_cpu/src/memory_desc/dnnl_memory_desc.h +++ b/src/plugins/intel_cpu/src/memory_desc/dnnl_memory_desc.h @@ -4,11 +4,11 @@ #pragma once -#include "dnnl_extension_utils.h" #include #include -#include "memory_desc/cpu_memory_desc.h" + #include "dnnl_extension_utils.h" +#include "memory_desc/cpu_memory_desc.h" namespace ov { namespace intel_cpu { @@ -29,13 +29,17 @@ class DnnlMemoryDesc : public virtual MemoryDesc { bool isCompatible(const MemoryDesc& rhs) const override; bool isCompatible(const DnnlMemoryDesc& rhs) const; - bool hasLayoutType(LayoutType layoutType) const override { return false; } + bool hasLayoutType(LayoutType layoutType) const override { + return false; + } std::string 
serializeFormat() const override; size_t getMaxMemSize() const override; - virtual bool isSame(dnnl::memory::format_tag fmt) const { return false; } + virtual bool isSame(dnnl::memory::format_tag fmt) const { + return false; + } const dnnl::memory::desc& getDnnlDesc() const { return desc; @@ -70,10 +74,9 @@ class DnnlMemoryDesc : public virtual MemoryDesc { bool isDefinedImp() const override; MemoryDescPtr cloneWithNewDimsImp(const VectorDims& dims) const override; - friend DnnlMemoryDescPtr DnnlExtensionUtils::makeDescriptor(const dnnl::memory::desc &desc); + friend DnnlMemoryDescPtr DnnlExtensionUtils::makeDescriptor(const dnnl::memory::desc& desc); friend DnnlMemoryDescPtr DnnlExtensionUtils::makeDescriptor(const_dnnl_memory_desc_t desc); }; -} // namespace intel_cpu -} // namespace ov - +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/memory_desc/empty_memory_desc.h b/src/plugins/intel_cpu/src/memory_desc/empty_memory_desc.h index 4b641669262591..c26cc6aa33a251 100644 --- a/src/plugins/intel_cpu/src/memory_desc/empty_memory_desc.h +++ b/src/plugins/intel_cpu/src/memory_desc/empty_memory_desc.h @@ -5,7 +5,6 @@ #pragma once #include "cpu_memory_desc.h" - #include "cpu_shape.h" #include "openvino/core/except.hpp" #include "openvino/core/type/element_type.hpp" @@ -23,8 +22,7 @@ namespace intel_cpu { */ class EmptyMemoryDesc : public MemoryDesc { public: - EmptyMemoryDesc(): - MemoryDesc(Shape{0}, Empty) { + EmptyMemoryDesc() : MemoryDesc(Shape{0}, Empty) { /* status never changes for an empty memory desc * so "define" beforehand to ensure isDefined() is thread safe */ status = MemoryDesc::descStatus::Defined; @@ -59,7 +57,11 @@ class EmptyMemoryDesc : public MemoryDesc { } MemoryDescPtr cloneWithNewPrecision(const ov::element::Type prec) const override { - OPENVINO_THROW("Clone an empty memory desc with any precision (", prec, ") is prohibited"); + OPENVINO_ASSERT(prec == ov::element::undefined, + "Clone an empty memory desc with 
defined precision: ", + prec, + " is prohibited"); + return clone(); } private: @@ -90,5 +92,5 @@ class EmptyMemoryDesc : public MemoryDesc { using EmptyMemoryDescPtr = std::shared_ptr; using EmptyMemoryDescCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/memory_state.cpp b/src/plugins/intel_cpu/src/memory_state.cpp index aa06f4ebd82957..c0dc85c4103ce4 100644 --- a/src/plugins/intel_cpu/src/memory_state.cpp +++ b/src/plugins/intel_cpu/src/memory_state.cpp @@ -5,30 +5,33 @@ #include "memory_state.h" #include + #include "cpu_memory.h" +#include "cpu_tensor.h" +#include "dnnl_extension_utils.h" #include "memory_desc/cpu_blocked_memory_desc.h" #include "memory_desc/cpu_memory_desc_utils.h" -#include "dnnl_extension_utils.h" -#include "cpu_tensor.h" -#include "utils/plain_tensor.hpp" -#include "openvino/core/parallel.hpp" #include "nodes/common/cpu_convert.h" #include "nodes/kernels/scaled_attn/attn_quant.hpp" +#include "openvino/core/parallel.hpp" +#include "utils/plain_tensor.hpp" using namespace ov::Extensions::Cpu::XARCH; namespace ov { namespace intel_cpu { -VariableStateBase::VariableStateBase(const std::string& name, const MemoryDescPtr& external_desc) : - IVariableState{name} , m_external_desc{external_desc} {} +VariableStateBase::VariableStateBase(const std::string& name, const MemoryDescPtr& external_desc) + : IVariableState{name}, + m_external_desc{external_desc} {} MemoryDescPtr VariableStateBase::to_static(const MemoryDescPtr& desc) { if (!desc->isDefined()) { auto&& current_dims = desc->getShape().getDims(); VectorDims new_dims(current_dims.size()); std::transform(current_dims.begin(), current_dims.end(), new_dims.begin(), [](Dim x) { - return x == Shape::UNDEFINED_DIM ? 0 : x; }); + return x == Shape::UNDEFINED_DIM ? 
0 : x; + }); return desc->cloneWithNewDims(new_dims, true); } @@ -71,21 +74,26 @@ ov::SoPtr VariableStateBase::get_state() const { return std::make_shared(internal_state_mem()); } - //test precision + // test precision { auto internal_prc = current_internal_desc->getPrecision(); auto tmp_desc = current_ext_desc->cloneWithNewPrecision(internal_prc); if (tmp_desc->isCompatible(*current_internal_desc)) { auto mem = std::make_shared(get_engine(), current_ext_desc); - size_t elements_to_convert = internal_state_mem()->getDescWithType()->getPaddedElementsCount(); + size_t elements_to_convert = + internal_state_mem()->getDescWithType()->getPaddedElementsCount(); auto external_prc = current_ext_desc->getPrecision(); - cpu_convert(internal_state_mem()->getData(), mem->getData(), internal_prc, external_prc, elements_to_convert); + cpu_convert(internal_state_mem()->getData(), + mem->getData(), + internal_prc, + external_prc, + elements_to_convert); return std::make_shared(mem); } } - //reorder + // reorder auto mem = std::make_shared(get_engine(), current_ext_desc); mem->load(*(internal_state_mem())); return std::make_shared(mem); @@ -108,19 +116,19 @@ void VariableStateBase::commit() { VariableStateDoubleBuffer::VariableStateDoubleBuffer(const std::string& name, const MemoryPtr& first_buffer, const MemoryPtr& second_buffer, - const MemoryDescPtr& external_desc) : - VariableStateBase(name, external_desc) { + const MemoryDescPtr& external_desc) + : VariableStateBase(name, external_desc) { OPENVINO_ASSERT(first_buffer && second_buffer); reset_prime_mem(first_buffer); reset_second_mem(second_buffer); m_internal_desc = prime_mem()->getDescPtr(); auto&& shape = m_internal_desc->getShape(); - //TODO what if by some reason we already have internal static state while the node is dynamic, is it even possible? + // TODO what if by some reason we already have internal static state while the node is dynamic, is it even possible? 
if (shape.isStatic()) { prime_mem()->nullify(); } else { - //in the case of the original desc has dynamic shape we create an empty tensor + // in the case of the original desc has dynamic shape we create an empty tensor auto new_desc = to_static(m_internal_desc); prime_mem()->redefineDesc(new_desc); } @@ -199,11 +207,11 @@ void VariableStateSingleBuffer::commit_impl() { // nothing to do } -VariableStateKVcache::VariableStateKVcache( - const std::string& name, - const MemoryDescPtr& external_desc, - const BlockedMemoryDescPtr& dense_internal_desc) : - VariableStateBase(name, external_desc), m_dense_internal_desc(dense_internal_desc) { +VariableStateKVcache::VariableStateKVcache(const std::string& name, + const MemoryDescPtr& external_desc, + const BlockedMemoryDescPtr& dense_internal_desc) + : VariableStateBase(name, external_desc), + m_dense_internal_desc(dense_internal_desc) { auto&& shape = external_desc->getShape(); OPENVINO_ASSERT(shape.isDynamic(), "VariableStateKVcache is unexpectedly initalized with a static tensor"); @@ -227,7 +235,7 @@ ov::SoPtr VariableStateKVcache::get_state() const { OPENVINO_ASSERT(actual_external_desc->getShape().getRank() == 4); auto&& actual_internal_order = actual_internal_desc->getOrder(); - //sanity check + // sanity check OPENVINO_ASSERT(actual_internal_order == m_dense_internal_desc->getOrder()); PlainTensor output, pastkv, beam_table; @@ -253,20 +261,12 @@ ov::SoPtr VariableStateKVcache::get_state() const { S, m_scale_zp.ptr(m, b_kv, h)[0], m_scale_zp.ptr(m, b_kv, h)[1]); - cpu_convert(buffers[ithr].ptr(), - output.ptr_v(m, b, h), - element::f32, - output.m_dt, - S); + cpu_convert(buffers[ithr].ptr(), output.ptr_v(m, b, h), element::f32, output.m_dt, S); }); } else { parallel_for3d(L0, B, H, [&](size_t m, size_t b, size_t h) { auto b_kv = static_cast(beam_table.at({b, m})); - cpu_convert(pastkv.ptr_v(m, b_kv, h), - output.ptr_v(m, b, h), - pastkv.m_dt, - output.m_dt, - S); + cpu_convert(pastkv.ptr_v(m, b_kv, h), 
output.ptr_v(m, b, h), pastkv.m_dt, output.m_dt, S); }); } @@ -274,11 +274,11 @@ ov::SoPtr VariableStateKVcache::get_state() const { } void VariableStateKVcache::set_state_impl(const ov::SoPtr& state) { - //1. reset the memory object - m_state = state; // simply to extend the lifetime + // 1. reset the memory object + m_state = state; // simply to extend the lifetime auto state_desc = MemoryDescUtils::generateCpuBlockedMemoryDesc(m_state); - //May be optimized by reusing the state tensor underlining memory pointer, but corner cases should be considered + // May be optimized by reusing the state tensor underlining memory pointer, but corner cases should be considered auto dense_internal_desc = m_dense_internal_desc->cloneWithNewDims(state_desc->getShape().getStaticDims()); m_internal_mem = std::make_shared(get_engine(), dense_internal_desc); @@ -287,7 +287,10 @@ void VariableStateKVcache::set_state_impl(const ov::SoPtr& state) { if (dense_internal_desc->getPrecision() == element::u8) { PlainTensor external, internal; auto&& actual_internal_order = m_dense_internal_desc->getOrder(); - external.resize(external_mem.getStaticDims(), state_desc->getPrecision().size(), state_desc->getPrecision(), m_state->data()); + external.resize(external_mem.getStaticDims(), + state_desc->getPrecision().size(), + state_desc->getPrecision(), + m_state->data()); internal.reset(m_internal_mem); external = external.permute(actual_internal_order); internal = internal.permute(actual_internal_order); @@ -300,11 +303,7 @@ void VariableStateKVcache::set_state_impl(const ov::SoPtr& state) { m_scale_zp.resize({L0, B, H, 2}); parallel_for3d(B, H, L0, [&](size_t ithr, size_t b, size_t h, size_t m) { buffers[ithr].resize({S}); - cpu_convert(external.ptr_v(m, b, h), - buffers[ithr].ptr(), - external.m_dt, - element::f32, - S); + cpu_convert(external.ptr_v(m, b, h), buffers[ithr].ptr(), external.m_dt, element::f32, S); attn_quant_u8(buffers[ithr].ptr(), internal.ptr(m, b, h), S, @@ -315,14 +314,13 @@ 
void VariableStateKVcache::set_state_impl(const ov::SoPtr& state) { m_internal_mem->load(external_mem); } - //2. Reset the beam search table + // 2. Reset the beam search table auto&& state_dims = dense_internal_desc->getShape().getStaticDims(); auto&& order = m_dense_internal_desc->getOrder(); const size_t size_B = state_dims[order.at(1)]; const size_t size_L = state_dims[order.at(0)]; - auto mem_desc = - std::make_shared(ov::element::i32, Shape{size_B, size_L}); + auto mem_desc = std::make_shared(ov::element::i32, Shape{size_B, size_L}); m_hidden_state = std::make_shared(get_engine(), mem_desc); auto buff = m_hidden_state->getDataAs(); @@ -336,11 +334,11 @@ void VariableStateKVcache::set_state_impl(const ov::SoPtr& state) { } void VariableStateKVcache::reset_impl() { - //nothing to do + // nothing to do } void VariableStateKVcache::commit_impl() { - //nothing to do + // nothing to do } MemoryPtr VariableStateKVcache::input_mem() { @@ -352,7 +350,7 @@ MemoryPtr VariableStateKVcache::output_mem() { } MemoryDescPtr VariableStateKVcache::internal_desc() const { - return m_dense_internal_desc; //since we don't store initial one + return m_dense_internal_desc; // since we don't store initial one } MemoryPtr VariableStateKVcache::internal_state_mem() const { diff --git a/src/plugins/intel_cpu/src/memory_state.h b/src/plugins/intel_cpu/src/memory_state.h index e7493f327e93fa..f35e78989b02f8 100644 --- a/src/plugins/intel_cpu/src/memory_state.h +++ b/src/plugins/intel_cpu/src/memory_state.h @@ -29,12 +29,12 @@ class VariableStateBase : public IVariableState { public: VariableStateBase(const std::string& name, const MemoryDescPtr& external_desc); - //ov::IVariableState - void set_state(const ov::SoPtr& state) override final; // NOLINT + // ov::IVariableState + void set_state(const ov::SoPtr& state) override final; // NOLINT ov::SoPtr get_state() const override; - void reset() override final; // NOLINT - bool is_reset_state() const override final; // NOLINT - void commit() 
override final; // NOLINT + void reset() override final; // NOLINT + bool is_reset_state() const override final; // NOLINT + void commit() override final; // NOLINT protected: virtual MemoryPtr internal_state_mem() const = 0; @@ -66,7 +66,7 @@ class VariableStateDoubleBuffer : public VariableStateBase { MemoryDescPtr internal_desc() const override; private: - //ov::intel_cpu::VariableStateBase + // ov::intel_cpu::VariableStateBase void reset_impl() override; void commit_impl() override; @@ -89,7 +89,7 @@ class VariableStateDoubleBuffer : public VariableStateBase { MemoryPtr internal_state_mem() const override; private: - MemoryDescPtr m_internal_desc; //mem desc required by the graph internal tensor + MemoryDescPtr m_internal_desc; // mem desc required by the graph internal tensor std::array m_internal_mem{}; size_t buffer_num = 0; }; @@ -111,7 +111,7 @@ class VariableStateSingleBuffer : public VariableStateBase { MemoryPtr internal_state_mem() const override; private: - MemoryDescPtr m_internal_desc; //mem desc required by the graph internal tensor + MemoryDescPtr m_internal_desc; // mem desc required by the graph internal tensor MemoryPtr m_internal_mem; }; @@ -121,10 +121,10 @@ class VariableStateKVcache : public VariableStateBase { const MemoryDescPtr& external_desc, const BlockedMemoryDescPtr& dense_internal_desc); - //ov::IVariableState + // ov::IVariableState ov::SoPtr get_state() const override; - //ov::intel_cpu::VariableStateBase + // ov::intel_cpu::VariableStateBase MemoryPtr input_mem() override; MemoryPtr output_mem() override; MemoryDescPtr internal_desc() const override; @@ -158,14 +158,14 @@ class VariableStateKVcache : public VariableStateBase { } private: - //ov::intel_cpu::VariableStateBase + // ov::intel_cpu::VariableStateBase void set_state_impl(const ov::SoPtr& state) override; void reset_impl() override; void commit_impl() override; private: - MemoryPtr m_internal_mem; // kv cache - MemoryPtr m_hidden_state; // beam access table + MemoryPtr 
m_internal_mem; // kv cache + MemoryPtr m_hidden_state; // beam access table size_t m_internal_mem_max_size = 0; size_t m_hidden_state_max_size = 0; @@ -178,5 +178,5 @@ class VariableStateKVcache : public VariableStateBase { using MemStatePtr = std::shared_ptr; using MemStateCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/mlas/thread_pool.hpp b/src/plugins/intel_cpu/src/mlas/thread_pool.hpp index 536b3746be1d69..5af8b0cce915fa 100644 --- a/src/plugins/intel_cpu/src/mlas/thread_pool.hpp +++ b/src/plugins/intel_cpu/src/mlas/thread_pool.hpp @@ -7,6 +7,7 @@ #include #include #include + #include "mlas.h" namespace ov { @@ -17,6 +18,7 @@ class OVMlasThreadPool : public IMlasThreadPool { explicit OVMlasThreadPool(const size_t& threadNum) : threadNum(threadNum) {} size_t DegreeOfParallelism() override; void TrySimpleParallelFor(const std::ptrdiff_t total, const std::function& fn) override; + public: // the actual threads used for sgemm size_t threadNum = 0; diff --git a/src/plugins/intel_cpu/src/node.cpp b/src/plugins/intel_cpu/src/node.cpp index f4c2b0eb686df6..ddf8d068f920a2 100644 --- a/src/plugins/intel_cpu/src/node.cpp +++ b/src/plugins/intel_cpu/src/node.cpp @@ -3,38 +3,38 @@ // #include "node.h" -#include "cpu_types.h" -#include "edge.h" -#include "partitioned_mem_blk.h" +#include +#include + +#include +#include +#include #include #include -#include +#include #include -#include #include +#include +#include "cpu_types.h" +#include "dnnl_extension_utils.h" +#include "edge.h" +#include "memory_desc/cpu_memory_desc_utils.h" +#include "memory_desc/dnnl_blocked_memory_desc.h" +#include "nodes/common/cpu_convert.h" #include "nodes/conv.h" #include "nodes/eltwise.h" #include "nodes/input.h" -#include "nodes/reorder.h" #include "nodes/reference.h" -#include "dnnl_extension_utils.h" - +#include "nodes/reorder.h" +#include "openvino/core/type/element_type.hpp" +#include 
"partitioned_mem_blk.h" +#include "utils/cpu_utils.hpp" #include "utils/debug_capabilities.h" +#include "utils/general_utils.h" #include "utils/ngraph_utils.hpp" #include "utils/rt_info/memory_formats_attribute.hpp" -#include - -#include -#include -#include "utils/general_utils.h" -#include "utils/cpu_utils.hpp" -#include "nodes/common/cpu_convert.h" -#include "memory_desc/cpu_memory_desc_utils.h" -#include "memory_desc/dnnl_blocked_memory_desc.h" -#include -#include using namespace dnnl; using namespace openvino; @@ -43,7 +43,7 @@ using namespace ov::intel_cpu::node; namespace ov { namespace intel_cpu { -Node::NodesFactory & Node::factory() { +Node::NodesFactory& Node::factory() { static NodesFactory factoryInstance; return factoryInstance; } @@ -62,7 +62,7 @@ Node::Node(const std::shared_ptr& op, type(TypeFromName(op->get_type_name())), profiling(op->get_friendly_name()) { for (size_t i = 0; i < op->get_input_size(); i++) { - const auto &shape = op->get_input_partial_shape(i); + const auto& shape = op->get_input_partial_shape(i); if (shape.rank().is_dynamic()) { OPENVINO_THROW("Unexpected: CPU plug-in doesn't support ", getTypeStr(), @@ -82,7 +82,7 @@ Node::Node(const std::shared_ptr& op, OPENVINO_THROW("Node with type '", typeStr, "' and name '", name, "' does not have any outputs."); } for (size_t i = 0; i < op->get_output_size(); i++) { - const auto &shape = op->get_output_partial_shape(i); + const auto& shape = op->get_output_partial_shape(i); if (shape.rank().is_dynamic()) { OPENVINO_THROW("Unexpected: CPU plug-in doesn't support ", getTypeStr(), @@ -98,8 +98,14 @@ Node::Node(const std::shared_ptr& op, childEdges.reserve(outputShapes.size()); } - isDynamic = std::any_of(inputShapes.begin(), inputShapes.end(), [](const Shape& shape){ return shape.isDynamic(); }) || - std::any_of(outputShapes.begin(), outputShapes.end(), [](const Shape& shape){ return shape.isDynamic(); }); + isDynamic = std::any_of(inputShapes.begin(), + inputShapes.end(), + [](const Shape& 
shape) { + return shape.isDynamic(); + }) || + std::any_of(outputShapes.begin(), outputShapes.end(), [](const Shape& shape) { + return shape.isDynamic(); + }); if (isDynamic) { shapeInference = shapeInferFactory.makeShapeInfer(); @@ -126,12 +132,13 @@ Node::Node(const std::shared_ptr& op, if (str.substr(0, 4) != "cpu:") continue; customImplPriorities.push_back(parse_impl_name(str)); - if (customImplPriorities.back() == impl_desc_type::unknown && - str != "cpu:unknown") + if (customImplPriorities.back() == impl_desc_type::unknown && str != "cpu:unknown") OPENVINO_THROW("Unsupported CPU implementation ", str, " for node ", getName()); } const auto& defaultImplPriorities = getDefaultImplPriority(); - customImplPriorities.insert(customImplPriorities.end(), defaultImplPriorities.begin(), defaultImplPriorities.end()); + customImplPriorities.insert(customImplPriorities.end(), + defaultImplPriorities.begin(), + defaultImplPriorities.end()); } std::string inputMemoryFormats = getInputMemoryFormats(op); @@ -198,10 +205,11 @@ void Node::addEdge(const EdgePtr& edge) { } void Node::remove() { - auto drop = [](std::vector edges){ + auto drop = [](std::vector edges) { for (auto& edge : edges) { auto edgePtr = edge.lock(); - if (!edgePtr) continue; + if (!edgePtr) + continue; edgePtr->getParent()->removeChildEdge(edgePtr); edgePtr->getChild()->removeParentEdge(edgePtr); } @@ -212,7 +220,7 @@ void Node::remove() { } bool Node::isEdgesEmpty(const std::vector& edges) const { - for (auto &edge : edges) { + for (auto& edge : edges) { if (edge.lock()) return false; } @@ -264,7 +272,8 @@ void Node::selectPreferPrimitiveDescriptor(const std::vector& pr auto parentEdge = getParentEdgeAt(j); auto parentPtr = parentEdge->getParent(); - // We don't take into account constant edges since reorders on them will be executed on load network stage + // We don't take into account constant edges since reorders on them will be executed on load network + // stage if (ignoreConstInputs && j > 0 && 
parentPtr->isConstant()) { equalsLocalFormatCount++; continue; @@ -285,10 +294,20 @@ void Node::selectPreferPrimitiveDescriptor(const std::vector& pr equalsLocalFormatCount++; } - DEBUG_LOG(getName(), " pd[", i, "].inConfs[", j, "]" - " is ", (isCompatible ? "compatible" : "not compatible"), - " with parent ", parentPtr->getName(), - " outConfs[", inNum, "], equalsLocalFormatCount add to ", equalsLocalFormatCount); + DEBUG_LOG(getName(), + " pd[", + i, + "].inConfs[", + j, + "]" + " is ", + (isCompatible ? "compatible" : "not compatible"), + " with parent ", + parentPtr->getName(), + " outConfs[", + inNum, + "], equalsLocalFormatCount add to ", + equalsLocalFormatCount); } if (equalsLocalFormatCount > equalsFormatCount) { @@ -333,7 +352,8 @@ bool Node::isReorderRequired(ov::intel_cpu::MemoryDescPtr desc1, ov::intel_cpu:: return !(isOneDimShape1 && isOneDimShape2 && samePrec); } -void Node::selectPreferPrimitiveDescriptorWithShape(const std::vector& priority, bool ignoreConstInputs) { +void Node::selectPreferPrimitiveDescriptorWithShape(const std::vector& priority, + bool ignoreConstInputs) { // Filter out dynamic shape. if (isDynamic) { return selectPreferPrimitiveDescriptor(priority, ignoreConstInputs); @@ -370,11 +390,22 @@ void Node::selectPreferPrimitiveDescriptorWithShape(const std::vectorgetShape().toPartialShape()) ? "one dim shape" : "not one dim shape"), - " with parent ", parentPtr->getName(), - " outConfs[", inNum, "], estimate add to ", estimate); + DEBUG_LOG(getName(), + " pd[", + i, + "].inConfs[", + j, + "]" + " is ", + (isCompatible ? "compatible" : "not compatible"), + " shape is ", + (isOneDimShape(curDesc->getShape().toPartialShape()) ? 
"one dim shape" : "not one dim shape"), + " with parent ", + parentPtr->getName(), + " outConfs[", + inNum, + "], estimate add to ", + estimate); } } return estimate; @@ -442,7 +473,7 @@ bool Node::canBeInPlace() const { } if (getParentEdges().size() != 1 || getParentEdgeAt(0)->getParent()->getChildEdges().size() != 1 || - (getParentEdgeAt(0)->getParent()->isConstant() && !getParentEdgeAt(0)->getChild()->isConstant())) + (getParentEdgeAt(0)->getParent()->isConstant() && !getParentEdgeAt(0)->getChild()->isConstant())) return false; // TODO: we need to extend this logic to properly handle all possible inplace conflicts @@ -462,7 +493,7 @@ bool Node::canBeInPlace() const { } void Node::resolveInPlaceEdges(Edge::LOOK look) { - const NodeDesc *selected_pd = getSelectedPrimitiveDescriptor(); + const NodeDesc* selected_pd = getSelectedPrimitiveDescriptor(); if (!selected_pd) OPENVINO_THROW("Cannot find selected primitive descriptor for node: ", getName()); if (look & Edge::LOOK_DOWN) { @@ -477,16 +508,19 @@ void Node::resolveInPlaceEdges(Edge::LOOK look) { " Unexpected inplace resolve call to an allocated edge: ", *parentEdge); - //search for already allocated edge + // search for already allocated edge const auto& childEdges = getChildEdgesAtPort(inplaceOutIndx); - auto itr = std::find_if(childEdges.begin(), childEdges.end(), [](const EdgePtr& edge) { return edge->getStatus() == Edge::Status::Allocated; }); + auto itr = std::find_if(childEdges.begin(), childEdges.end(), [](const EdgePtr& edge) { + return edge->getStatus() == Edge::Status::Allocated; + }); OPENVINO_ASSERT(itr != childEdges.end(), " Could not find an allocated edge to resolve in-place for node: ", getName()); auto baseMemBlock = (*itr)->getMemory().getMemoryBlock(); auto memBlock = std::make_shared(baseMemBlock); - auto newMem = std::make_shared(getEngine(), selected_pd->getConfig().inConfs[i].getMemDesc(), memBlock); + auto newMem = + std::make_shared(getEngine(), 
selected_pd->getConfig().inConfs[i].getMemDesc(), memBlock); parentEdge->reuse(newMem); } } @@ -505,7 +539,8 @@ void Node::resolveInPlaceEdges(Edge::LOOK look) { OPENVINO_ASSERT(childEdge->getStatus() == Edge::Status::NotAllocated, " Unexpected inplace resolve call to an allocated edge: ", *childEdge); - auto newMem = std::make_shared(getEngine(), selected_pd->getConfig().outConfs[i].getMemDesc(), memBlock); + auto newMem = + std::make_shared(getEngine(), selected_pd->getConfig().outConfs[i].getMemDesc(), memBlock); childEdge->reuse(newMem); } } @@ -565,9 +600,9 @@ std::string Node::getPrimitiveDescriptorType() const { str_type += t; }; -#define SEARCH_TYPE(_type) \ - if ((type & impl_desc_type::_type) == impl_desc_type::_type) \ - add_type(#_type) +#define SEARCH_TYPE(_type) \ + if ((type & impl_desc_type::_type) == impl_desc_type::_type) \ + add_type(#_type) SEARCH_TYPE(undef); SEARCH_TYPE(reorder); @@ -608,13 +643,19 @@ std::string Node::getPrimitiveDescriptorType() const { if (selectedPrimitiveDesc) { if (!selectedPrimitiveDesc->getConfig().inConfs.empty()) { if (selectedPrimitiveDesc->getConfig().inConfs[0].getMemDesc()->getPrecision() != ov::element::u8) { - str_type += "_" + std::string(selectedPrimitiveDesc->getConfig().inConfs[0].getMemDesc()->getPrecision().get_type_name()); + str_type += + "_" + + std::string( + selectedPrimitiveDesc->getConfig().inConfs[0].getMemDesc()->getPrecision().get_type_name()); } else { str_type += "_I8"; } } else { if (selectedPrimitiveDesc->getConfig().outConfs[0].getMemDesc()->getPrecision() != ov::element::u8) { - str_type += "_" + std::string(selectedPrimitiveDesc->getConfig().outConfs[0].getMemDesc()->getPrecision().get_type_name()); + str_type += + "_" + + std::string( + selectedPrimitiveDesc->getConfig().outConfs[0].getMemDesc()->getPrecision().get_type_name()); } else { str_type += "_I8"; } @@ -650,7 +691,7 @@ std::vector Node::getChildEdgesAtPort(int inputNum) const { OPENVINO_THROW("Node ", getName(), " contains less 
output ports than ", inputNum); std::vector res; - for (auto &edge_w : childEdges) { + for (auto& edge_w : childEdges) { auto edge = edge_w.lock(); if (!edge) OPENVINO_THROW("Node ", getName(), " contains dead weak ptr"); @@ -660,7 +701,7 @@ std::vector Node::getChildEdgesAtPort(int inputNum) const { return res; } -std::vector Node::getAvailableFormatsForDims(const Shape &dims) const { +std::vector Node::getAvailableFormatsForDims(const Shape& dims) const { if (dims.getRank() == 0) return {memory::format_tag::x}; else if (dims.getRank() == 1) @@ -668,8 +709,11 @@ std::vector Node::getAvailableFormatsForDims(const Shape &di else if (dims.getRank() == 2) return {memory::format_tag::nc}; else if (dims.getRank() == 3) - return {memory::format_tag::tnc, memory::format_tag::ntc, - memory::format_tag::ncw, memory::format_tag::nCw8c, memory::format_tag::nCw16c }; + return {memory::format_tag::tnc, + memory::format_tag::ntc, + memory::format_tag::ncw, + memory::format_tag::nCw8c, + memory::format_tag::nCw16c}; else if (dims.getRank() == 4) return {memory::format_tag::nchw, memory::format_tag::nChw8c, memory::format_tag::nChw16c}; else if (dims.getRank() == 5) @@ -694,36 +738,36 @@ void Node::updateShapes() { getTypeStr(), " with name: ", getName()); - try { - if (needShapeInfer()) { - auto result = shapeInfer(); - if (ShapeInferStatus::success == result.status) { - redefineOutputMemory(result.dims); + try { + if (needShapeInfer()) { + auto result = shapeInfer(); + if (ShapeInferStatus::success == result.status) { + redefineOutputMemory(result.dims); + } + } else { + // guard check for internal dynamic nodes to avoid possible overestimation of the required memory size + if (shapeInference && FULL_PORT_MASK == shapeInference->get_port_mask()) + return; + + for (auto&& edge : getChildEdges()) { + auto edge_ptr = edge.lock(); + CPU_NODE_ASSERT(edge_ptr, " has null edge"); + if (edge_ptr->inPlace(Edge::LOOK_UP)) { + continue; } - } else { - //guard check for internal dynamic 
nodes to avoid possible overestimation of the required memory size - if (shapeInference && FULL_PORT_MASK == shapeInference->get_port_mask()) - return; - - for (auto&& edge : getChildEdges()) { - auto edge_ptr = edge.lock(); - CPU_NODE_ASSERT(edge_ptr, " has null edge"); - if (edge_ptr->inPlace(Edge::LOOK_UP)) { - continue; - } - auto mem = edge_ptr->getMemoryPtr(); - CPU_NODE_ASSERT(mem, " has null output memory"); + auto mem = edge_ptr->getMemoryPtr(); + CPU_NODE_ASSERT(mem, " has null output memory"); - if (mem->getShape().hasZeroDims()) { - continue; - } - fetchRawMemory(mem); + if (mem->getShape().hasZeroDims()) { + continue; } + fetchRawMemory(mem); } - } catch (const std::exception& exp) { - THROW_CPU_NODE_ERR(exp.what()); } + } catch (const std::exception& exp) { + THROW_CPU_NODE_ERR(exp.what()); + } } void Node::updateDynamicParams() { @@ -735,10 +779,17 @@ void Node::updateDynamicParams() { try { if (isExecutable()) { if (needPrepareParams()) { - OPENVINO_ASSERT(inputShapesDefined(), - "Input shapes are not defined."); - DEBUG_LOG(" prepareParams() on #", getExecIndex(), " ", getTypeStr(), " ", algToString(getAlgorithm()), - " ", getName(), " ", getOriginalLayers()); + OPENVINO_ASSERT(inputShapesDefined(), "Input shapes are not defined."); + DEBUG_LOG(" prepareParams() on #", + getExecIndex(), + " ", + getTypeStr(), + " ", + algToString(getAlgorithm()), + " ", + getName(), + " ", + getOriginalLayers()); prepareParams(); } } @@ -781,7 +832,7 @@ bool Node::outputShapeDataDependency() const { return false; } -void Node::redefineOutputMemory(const std::vector &newOutputShapes) { +void Node::redefineOutputMemory(const std::vector& newOutputShapes) { if (newOutputShapes.size() != outputShapes.size()) { OPENVINO_THROW("Number shapes mismatch with real outputs number for node with name: ", getName()); } @@ -840,34 +891,45 @@ void Node::initSupportedPrimitiveDescriptors() { }; /* When custom implementation priorities are NOT defined it is enough to - * just use 
the first implementation from the priority list. - * When custom implementation priorities are defined, all the implementations should be considered, - * since custom implementations can be not available at all, so a fallback to the default ones must happen - * To achive the fallback, it is necessary to create a supported primitive descriptor for each implementation - * since oneDNN primitive is mutating while iterating */ + * just use the first implementation from the priority list. + * When custom implementation priorities are defined, all the implementations should be considered, + * since custom implementations can be not available at all, so a fallback to the default ones must happen + * To achive the fallback, it is necessary to create a supported primitive descriptor for each implementation + * since oneDNN primitive is mutating while iterating */ #ifdef CPU_DEBUG_CAPS { - if (!customImplPriorities.empty()) { - DEBUG_LOG("#", getName(), " customImplPriorities [", 0 , "/", customImplPriorities.size(), - "]: ", impl_type_to_string(customImplPriorities[0])); - } + if (!customImplPriorities.empty()) { + DEBUG_LOG("#", + getName(), + " customImplPriorities [", + 0, + "/", + customImplPriorities.size(), + "]: ", + impl_type_to_string(customImplPriorities[0])); + } } #endif for (auto& desc : descs) { auto first_desc = dnnl::primitive_desc(DnnlExtensionUtils::clone_primitive_desc(desc.get())); const bool first_match = customImplPriorities.empty(); - DEBUG_LOG("#", getName(), - ", itpd.impl_info_str(): ", desc.impl_info_str(), - ", parsed imp_type: ", impl_type_to_string(parse_impl_name(desc.impl_info_str())), - ", first_match: ", first_match ? 
"true" : "false"); - DnnlExtensionUtils::for_each_implementation(desc, - first_match, - [&](impl_desc_type implType) { - return contains(getImplPriority(), implType); - }, - [&](dnnl::primitive_desc& desc) { - addSupportedPrimitiveDescriptor(desc); - }); + DEBUG_LOG("#", + getName(), + ", itpd.impl_info_str(): ", + desc.impl_info_str(), + ", parsed imp_type: ", + impl_type_to_string(parse_impl_name(desc.impl_info_str())), + ", first_match: ", + first_match ? "true" : "false"); + DnnlExtensionUtils::for_each_implementation( + desc, + first_match, + [&](impl_desc_type implType) { + return contains(getImplPriority(), implType); + }, + [&](dnnl::primitive_desc& desc) { + addSupportedPrimitiveDescriptor(desc); + }); // fallback. if none of the primitive types is present in the priority list just add first implementation // @todo this fallback is not necessary if primitive priority list is filled correctly @@ -888,22 +950,29 @@ void Node::filterSupportedPrimitiveDescriptors() { }; auto isNotSuitableDesc = [&](const NodeDesc& desc) { - const auto &config = desc.getConfig(); - if (inputMemoryFormatsFilter.size() > config.inConfs.size() || outputMemoryFormatsFilter.size() > config.outConfs.size()) + const auto& config = desc.getConfig(); + if (inputMemoryFormatsFilter.size() > config.inConfs.size() || + outputMemoryFormatsFilter.size() > config.outConfs.size()) OPENVINO_THROW("Incorrect number of input or output memory formats"); for (size_t i = 0; i < inputMemoryFormatsFilter.size(); i++) { if (!areCompatible(*config.inConfs[i].getMemDesc(), inputMemoryFormatsFilter[i])) { - DEBUG_LOG(getName(), " input memory format filter: ", inputMemoryFormatsFilter[i], - " not matched. Erase desc from supported primitive descriptors: ", desc); + DEBUG_LOG(getName(), + " input memory format filter: ", + inputMemoryFormatsFilter[i], + " not matched. 
Erase desc from supported primitive descriptors: ", + desc); return true; } } for (size_t i = 0; i < outputMemoryFormatsFilter.size(); i++) { if (!areCompatible(*config.outConfs[i].getMemDesc(), outputMemoryFormatsFilter[i])) { - DEBUG_LOG(getName(), " Output memory format filter: ", outputMemoryFormatsFilter[i], - " not matched. Erase desc from supported primitive descriptors: ", desc); + DEBUG_LOG(getName(), + " Output memory format filter: ", + outputMemoryFormatsFilter[i], + " not matched. Erase desc from supported primitive descriptors: ", + desc); return true; } } @@ -931,7 +1000,8 @@ void Node::initDescriptor(const NodeConfig& config) { if (descs.empty()) { const auto& selectedConfig = selectedPD->getConfig(); - if (selectedConfig.inConfs.size() != config.inConfs.size() || selectedConfig.outConfs.size() != config.outConfs.size()) + if (selectedConfig.inConfs.size() != config.inConfs.size() || + selectedConfig.outConfs.size() != config.outConfs.size()) return; for (size_t i = 0; i < selectedConfig.inConfs.size(); i++) { @@ -948,19 +1018,19 @@ void Node::initDescriptor(const NodeConfig& config) { return; } - auto updateNodeConfig = [&](const NodeConfig& cfg){ + auto updateNodeConfig = [&](const NodeConfig& cfg) { auto updatedConfig = cfg; for (size_t i = 0; i < descInputNumbers(); i++) { PortConfig& dataConfig = updatedConfig.inConfs[i]; - dataConfig.inPlace(canBeInPlace() ? 0 : -1); // update inPlace - dataConfig.setMemDesc(dataConfig.getMemDesc()); // reset desc with default compatibility mask + dataConfig.inPlace(canBeInPlace() ? 
0 : -1); // update inPlace + dataConfig.setMemDesc(dataConfig.getMemDesc()); // reset desc with default compatibility mask } for (size_t i = 0; i < descOutputNumbers(); i++) { PortConfig& dataConfig = updatedConfig.outConfs[i]; - dataConfig.inPlace(-1); // update inPlace - dataConfig.setMemDesc(dataConfig.getMemDesc()); // reset desc with default compatibility mask + dataConfig.inPlace(-1); // update inPlace + dataConfig.setMemDesc(dataConfig.getMemDesc()); // reset desc with default compatibility mask } return updatedConfig; @@ -1016,8 +1086,8 @@ void Node::prepareMemory(const DnnlMemoryDescPtr& intDesc, size_t indx) { MemoryPtr ptr; auto weightCache = context->getWeightsCache(); if (weightCache != nullptr && memory::format_kind::blocked == intDesc->getDnnlDesc().get_format_kind()) { - const auto string_hash = - name + "_" + std::to_string(indx) + "_" + DnnlExtensionUtils::computeWeightsStringHash(internalBlob, intDesc); + const auto string_hash = name + "_" + std::to_string(indx) + "_" + + DnnlExtensionUtils::computeWeightsStringHash(internalBlob, intDesc); ptr = *weightCache->findOrCreate(string_hash, create); } else { ptr = create(); @@ -1042,7 +1112,7 @@ void Node::prepareMemory(const std::vector& intDescs) { void Node::prepareMemory(dnnl::primitive_desc_iterator& itpd) { std::vector intDescs; - for (auto &it : internalBlobDesc) + for (auto& it : internalBlobDesc) intDescs.push_back(it(itpd, 0)); Node::prepareMemory(intDescs); @@ -1062,8 +1132,8 @@ MemoryPtr Node::prepareWeightMemory(DnnlMemoryDescPtr dstWeightDesc, DnnlMemoryD srcWeightDesc = DnnlExtensionUtils::makeDescriptor(weightSrcDesc); } - auto create = [&] () { - Memory srcMemory{ getEngine(), srcWeightDesc, edgeMem->getData() }; + auto create = [&]() { + Memory srcMemory{getEngine(), srcWeightDesc, edgeMem->getData()}; MemoryPtr _ptr = std::make_shared(getEngine(), dstWeightDesc); node::Reorder::reorderData(srcMemory, *_ptr, context->getParamsCache()); @@ -1106,7 +1176,7 @@ void 
Node::toNumaNodeImpl(int numaNodeID) { // create scratch pad from specified numa node if (scratchpadMem) { - scratchpadMem = context->getScratchPad(numaNodeID)->createScratchPadMem(scratchpadMem->getDescPtr()); + scratchpadMem = context->getScratchPad()->createScratchPadMem(scratchpadMem->getDescPtr()); primArgs[DNNL_ARG_SCRATCHPAD] = scratchpadMem->getPrimitive(); } @@ -1127,13 +1197,13 @@ bool Node::isInPlace() const { inplace = InPlaceType::NoInPlace; auto config = selected_pd->getConfig(); - for (auto &in : config.inConfs) { + for (auto& in : config.inConfs) { if (in.inPlace() >= 0) { inplace = InPlaceType::InPlace; break; } } - for (auto &out : config.outConfs) { + for (auto& out : config.outConfs) { if (out.inPlace() >= 0) { inplace = InPlaceType::InPlace; break; @@ -1164,7 +1234,7 @@ void Node::updateConstantType() { const auto prevConstantType = constant; constant = isConst ? ConstantType::Const : ConstantType::NoConst; if (constant == prevConstantType) - return; // state has not changed, no reason to continue + return; // state has not changed, no reason to continue for (const auto& childEdge : getChildEdges()) { const auto childNode = childEdge.lock()->getChild(); @@ -1173,7 +1243,8 @@ void Node::updateConstantType() { } void Node::addOriginalLayer(const std::string& layerName) { - if (layerName.empty()) return; + if (layerName.empty()) + return; if (originalLayers.empty()) { originalLayers = layerName; } else { @@ -1196,46 +1267,25 @@ void Node::cleanup() { const std::vector& Node::getDefaultImplPriority() { static const std::vector priorities { impl_desc_type::unknown, - // Undef impl type is used to express use-cases there real type is unkown during compilation - // Undef has higher priority than defined types in order to force primitive selection logic to make decision based on other properties - impl_desc_type::undef, - impl_desc_type::brgconv_avx512_amx_1x1, - impl_desc_type::brgconv_avx512_amx, - impl_desc_type::jit_avx512_amx_dw, - 
impl_desc_type::jit_avx512_amx_1x1, - impl_desc_type::jit_avx512_amx, - // Brgconv kernels disabled in order to prevent perf degradations on non AMX HW - // impl_desc_type::brgconv_avx512_1x1, - // impl_desc_type::brgconv_avx512, - impl_desc_type::jit_uni_dw, - impl_desc_type::jit_uni_1x1, - impl_desc_type::jit_uni, - impl_desc_type::jit_avx512_dw, - impl_desc_type::jit_avx512_1x1, - impl_desc_type::jit_avx512, - impl_desc_type::jit_avx2_dw, - impl_desc_type::jit_avx2_1x1, - impl_desc_type::jit_avx2, - impl_desc_type::jit_avx_dw, - impl_desc_type::jit_avx_1x1, - impl_desc_type::jit_avx, - impl_desc_type::jit_sse42_dw, - impl_desc_type::jit_sse42_1x1, - impl_desc_type::jit_sse42, + // Undef impl type is used to express use-cases there real type is unkown during compilation + // Undef has higher priority than defined types in order to force primitive selection logic to make decision + // based on other properties + impl_desc_type::undef, impl_desc_type::brgconv_avx512_amx_1x1, impl_desc_type::brgconv_avx512_amx, + impl_desc_type::jit_avx512_amx_dw, impl_desc_type::jit_avx512_amx_1x1, impl_desc_type::jit_avx512_amx, + // Brgconv kernels disabled in order to prevent perf degradations on non AMX HW + // impl_desc_type::brgconv_avx512_1x1, + // impl_desc_type::brgconv_avx512, + impl_desc_type::jit_uni_dw, impl_desc_type::jit_uni_1x1, impl_desc_type::jit_uni, + impl_desc_type::jit_avx512_dw, impl_desc_type::jit_avx512_1x1, impl_desc_type::jit_avx512, + impl_desc_type::jit_avx2_dw, impl_desc_type::jit_avx2_1x1, impl_desc_type::jit_avx2, + impl_desc_type::jit_avx_dw, impl_desc_type::jit_avx_1x1, impl_desc_type::jit_avx, + impl_desc_type::jit_sse42_dw, impl_desc_type::jit_sse42_1x1, impl_desc_type::jit_sse42, #if defined(OPENVINO_ARCH_ARM64) - impl_desc_type::jit_asimd, + impl_desc_type::jit_asimd, #endif - impl_desc_type::gemm_any, - impl_desc_type::gemm_blas, - impl_desc_type::gemm_avx512, - impl_desc_type::gemm_avx2, - impl_desc_type::gemm_avx, - 
impl_desc_type::gemm_sse42, - impl_desc_type::gemm_acl, - impl_desc_type::acl, - impl_desc_type::jit_gemm, - impl_desc_type::ref_any, - impl_desc_type::ref, + impl_desc_type::gemm_any, impl_desc_type::gemm_blas, impl_desc_type::gemm_avx512, impl_desc_type::gemm_avx2, + impl_desc_type::gemm_avx, impl_desc_type::gemm_sse42, impl_desc_type::gemm_acl, impl_desc_type::acl, + impl_desc_type::jit_gemm, impl_desc_type::ref_any, impl_desc_type::ref, }; return priorities; @@ -1245,30 +1295,31 @@ const std::vector& Node::getImplPriority() { if (!customImplPriorities.empty()) return customImplPriorities; - return getDefaultImplPriority(); } -PortDescBasePtr Node::getConsistentInputDesc(const NodeConfig &config, size_t idx) const { +PortDescBasePtr Node::getConsistentInputDesc(const NodeConfig& config, size_t idx) const { const auto& inConf = config.inConfs[idx]; - if (inConf.inPlace() >= 0) { // node have inplace input + if (inConf.inPlace() >= 0) { // node have inplace input auto inplaceIndx = static_cast(inConf.inPlace()); PortDescBasePtr outPortDesc; const auto& outConf = config.outConfs[inplaceIndx]; - if (outConf.inPlace() == static_cast(idx)) { // the input desc port is the same port used for inplace output - outPortDesc = outConf.getPortDesc(); // just use desc from this output port + if (outConf.inPlace() == + static_cast(idx)) { // the input desc port is the same port used for inplace output + outPortDesc = outConf.getPortDesc(); // just use desc from this output port } else { - outPortDesc = getConsistentOutputDesc(config, inplaceIndx); // get consistent desc otherwise + outPortDesc = getConsistentOutputDesc(config, inplaceIndx); // get consistent desc otherwise } - if (inConf.getPortDesc()->isCompatible(*outPortDesc)) { // use the desc if compatible + if (inConf.getPortDesc()->isCompatible(*outPortDesc)) { // use the desc if compatible return outPortDesc; } } - auto *parentSelectedPD = getParentEdgeAt(idx)->getParent()->getSelectedPrimitiveDescriptor(); + auto* 
parentSelectedPD = getParentEdgeAt(idx)->getParent()->getSelectedPrimitiveDescriptor(); if (!parentSelectedPD) - OPENVINO_THROW("Cannot get selected primitive descriptor for node: ", getParentEdgeAt(idx)->getParent()->getName()); + OPENVINO_THROW("Cannot get selected primitive descriptor for node: ", + getParentEdgeAt(idx)->getParent()->getName()); int num = getParentEdgeAt(idx)->getInputNum(); if (num >= 0) { @@ -1289,26 +1340,28 @@ PortDescBasePtr Node::getConsistentInputDesc(const NodeConfig &config, size_t id return inConf.getPortDesc(); } -PortDescBasePtr Node::getConsistentOutputDesc(const NodeConfig &config, size_t idx) const { +PortDescBasePtr Node::getConsistentOutputDesc(const NodeConfig& config, size_t idx) const { const auto& outConf = config.outConfs[idx]; - if (outConf.inPlace() >= 0) { // node have inplace output + if (outConf.inPlace() >= 0) { // node have inplace output auto inplaceIndx = static_cast(outConf.inPlace()); PortDescBasePtr inpPortDesc; const auto& inpConf = config.inConfs[inplaceIndx]; - if (inpConf.inPlace() == static_cast(idx)) { // the input desc port is the same port used for inplace output - inpPortDesc = inpConf.getPortDesc(); // just use desc from this output port + if (inpConf.inPlace() == + static_cast(idx)) { // the input desc port is the same port used for inplace output + inpPortDesc = inpConf.getPortDesc(); // just use desc from this output port } else { - inpPortDesc = getConsistentInputDesc(config, inplaceIndx); // get consistent desc otherwise + inpPortDesc = getConsistentInputDesc(config, inplaceIndx); // get consistent desc otherwise } - if (outConf.getPortDesc()->isCompatible(*inpPortDesc)) { // use the desc if compatible + if (outConf.getPortDesc()->isCompatible(*inpPortDesc)) { // use the desc if compatible return inpPortDesc; } } - auto *childSelectedPD = getChildEdgeAt(idx)->getChild()->getSelectedPrimitiveDescriptor(); + auto* childSelectedPD = getChildEdgeAt(idx)->getChild()->getSelectedPrimitiveDescriptor(); 
if (!childSelectedPD) - OPENVINO_THROW("Cannot get selected primitive descriptor for node: ", getChildEdgeAt(idx)->getChild()->getName()); + OPENVINO_THROW("Cannot get selected primitive descriptor for node: ", + getChildEdgeAt(idx)->getChild()->getName()); int num = getChildEdgeAt(idx)->getOutputNum(); if (num >= 0) { @@ -1330,7 +1383,7 @@ PortDescBasePtr Node::getConsistentOutputDesc(const NodeConfig &config, size_t i } void Node::initOptimalPrimitiveDescriptor() { - if (one_of(getType(), Type::RNNCell, Type::RNNSeq)) // can be skipped for RNN node + if (one_of(getType(), Type::RNNCell, Type::RNNSeq)) // can be skipped for RNN node return; auto selected_pd = getSelectedPrimitiveDescriptor(); @@ -1357,7 +1410,8 @@ void Node::initOptimalPrimitiveDescriptor() { // it is assumed that the nodes will define dense tensors on output edges // if it is not the case the implementation must redefine this behaviour if (outMemDesc->getType() & Blocked) { - config.outConfs[i].setMemDesc(std::dynamic_pointer_cast(outMemDesc), BlockedMemoryDesc::FULL_MASK); + config.outConfs[i].setMemDesc(std::dynamic_pointer_cast(outMemDesc), + BlockedMemoryDesc::FULL_MASK); } } } @@ -1365,9 +1419,9 @@ void Node::initOptimalPrimitiveDescriptor() { initDescriptor(config); } -bool Node::isConfigDefined(const NodeConfig &config) const { +bool Node::isConfigDefined(const NodeConfig& config) const { for (const auto& configs : {config.inConfs, config.outConfs}) { - for (const auto &dc : configs) { + for (const auto& dc : configs) { if (!dc.getMemDesc()->isDefined()) return false; } @@ -1375,14 +1429,14 @@ bool Node::isConfigDefined(const NodeConfig &config) const { return true; } -MemoryDescPtr Node::getSrcMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const { +MemoryDescPtr Node::getSrcMemDesc(const dnnl::primitive_desc& prim_desc, size_t idx) const { if (getInputShapeAtPort(idx).isDynamic()) { return DnnlExtensionUtils::makeUndefinedDesc(prim_desc.src_desc(idx), 
getInputShapeAtPort(idx)); } return DnnlExtensionUtils::makeDescriptor(prim_desc.src_desc(idx)); } -MemoryDescPtr Node::getDstMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const { +MemoryDescPtr Node::getDstMemDesc(const dnnl::primitive_desc& prim_desc, size_t idx) const { if (getOutputShapeAtPort(idx).isDynamic()) { return DnnlExtensionUtils::makeUndefinedDesc(prim_desc.dst_desc(idx), getOutputShapeAtPort(idx)); } @@ -1392,7 +1446,7 @@ MemoryDescPtr Node::getDstMemDesc(const dnnl::primitive_desc &prim_desc, size_t void Node::appendPostOpArgs(const dnnl::primitive_attr& attr, std::unordered_map& primArgs, const std::unordered_map& postOpsArgs) { - for (auto & entry : postOpsArgs) { + for (auto& entry : postOpsArgs) { primArgs[entry.first] = entry.second->getPrimitive(); } } @@ -1425,11 +1479,17 @@ dnnl::memory::format_tag Node::getWeightsFormatTagByDims(const VectorDims& dims) } } -void Node::appendPostOps(dnnl::post_ops& ops, const VectorDims &postOpDims, std::unordered_map& postOpsMem, const int channelAxis) { +void Node::appendPostOps(dnnl::post_ops& ops, + const VectorDims& postOpDims, + std::unordered_map& postOpsMem, + const int channelAxis) { OPENVINO_THROW("Fusing of ", NameFromType(this->getType()), " operation is not implemented"); } -void Node::appendPostOps(dnnl::post_ops& ops, const VectorDims &postOpDims, std::vector& postOpsMem, const int channelAxis) { +void Node::appendPostOps(dnnl::post_ops& ops, + const VectorDims& postOpDims, + std::vector& postOpsMem, + const int channelAxis) { OPENVINO_THROW("Fusing of ", NameFromType(this->getType()), " operation is not implemented"); } @@ -1473,12 +1533,12 @@ ov::element::Type Node::getRuntimePrecision() const { } Node* Node::NodesFactory::create(const std::shared_ptr& op, const GraphContext::CPtr context) { - // getExceptionDescWithoutStatus removes redundant information from the exception message. 
For instance, the NotImplemented - // exception is generated in the form: full_path_to_src_file:line_number [ NOT_IMPLEMENTED ] reason. + // getExceptionDescWithoutStatus removes redundant information from the exception message. For instance, the + // NotImplemented exception is generated in the form: full_path_to_src_file:line_number [ NOT_IMPLEMENTED ] reason. // An example for gather node: - // /path-to-openVino-root/src/plugins/intel_cpu/nodes/gather.cpp:42 [ NOT_IMPLEMENTED ] Only opset7 Gather operation is supported - // The most important part of the message is the reason, so the lambda trims everything up to "]" - // Note that the op type and its friendly name will also be provided if we fail to create the node. + // /path-to-openVino-root/src/plugins/intel_cpu/nodes/gather.cpp:42 [ NOT_IMPLEMENTED ] Only opset7 Gather operation + // is supported The most important part of the message is the reason, so the lambda trims everything up to "]" Note + // that the op type and its friendly name will also be provided if we fail to create the node. 
auto getExceptionDescWithoutStatus = [](const ov::Exception& ex) { std::string desc = ex.what(); size_t pos = desc.find(']'); @@ -1491,7 +1551,7 @@ Node* Node::NodesFactory::create(const std::shared_ptr& op, const Grap } return desc; }; - Node *newNode = nullptr; + Node* newNode = nullptr; std::string errorMessage; if (newNode == nullptr) { try { @@ -1538,7 +1598,7 @@ Node* Node::NodesFactory::create(const std::shared_ptr& op, const Grap return newNode; } -bool Node::canBePerformedAsScaleShift(const Node *parentNode) const { +bool Node::canBePerformedAsScaleShift(const Node* parentNode) const { #if defined(OPENVINO_ARCH_X86_64) OPENVINO_ASSERT(parentNode); @@ -1546,7 +1606,7 @@ bool Node::canBePerformedAsScaleShift(const Node *parentNode) const { const auto channelAxis = parentNode->getFusingAxis(); for (size_t i = 0; i < getParentEdges().size(); i++) { - Node *node = getParentEdgeAt(i)->getParent().get(); + Node* node = getParentEdgeAt(i)->getParent().get(); if (node == nullptr) { OPENVINO_THROW("Cannot get parent node for ", getName(), " on ", i, " port"); } @@ -1574,7 +1634,7 @@ bool Node::canBePerformedAsScaleShift(const Node *parentNode) const { const auto isConvertablePowerStatic = [&]() { if (getAlgorithm() == Algorithm::EltwisePowerStatic) { - const auto eltwise = dynamic_cast(this); + const auto eltwise = dynamic_cast(this); if (!eltwise) { OPENVINO_THROW("Cannot cast ", getName(), " to Eltwise"); } @@ -1583,13 +1643,15 @@ bool Node::canBePerformedAsScaleShift(const Node *parentNode) const { return false; }; - return (one_of(getAlgorithm(), Algorithm::EltwiseAdd, - Algorithm::EltwiseMultiply, - Algorithm::EltwiseSubtract, - Algorithm::EltwiseDivide, - Algorithm::EltwisePrelu, - Algorithm::EltwiseMulAdd) && isBroadcastableToDataInput()) - || isConvertablePowerStatic(); + return (one_of(getAlgorithm(), + Algorithm::EltwiseAdd, + Algorithm::EltwiseMultiply, + Algorithm::EltwiseSubtract, + Algorithm::EltwiseDivide, + Algorithm::EltwisePrelu, + 
Algorithm::EltwiseMulAdd) && + isBroadcastableToDataInput()) || + isConvertablePowerStatic(); #else // TODO: provide correct list of operations for other backends return false; @@ -1599,11 +1661,11 @@ bool Node::canBePerformedAsScaleShift(const Node *parentNode) const { // @todo shifts for Subtract and scales for Divide are replaced with // Add (with opposite sign) and Multiply (with inverse value) for legacy dephwise post ops // This can be avoided after dephwise post ops are gone -std::pair, std::vector> Node::getScalesAndShifts(const Node *parentNode) const { +std::pair, std::vector> Node::getScalesAndShifts(const Node* parentNode) const { std::vector scales, shifts; const auto fillValuesFrom = [&](const NodePtr& constInput, std::vector& buffer) { - auto *constInputNode = dynamic_cast(constInput.get()); + auto* constInputNode = dynamic_cast(constInput.get()); if (!constInputNode) { OPENVINO_THROW("Cannot cast ", constInput->getName(), " to Input"); } @@ -1627,7 +1689,7 @@ std::pair, std::vector> Node::getScalesAndShifts(const fillValuesFrom(getParentEdgeAt(1)->getParent(), scales); fillValuesFrom(getParentEdgeAt(2)->getParent(), shifts); } else if (one_of(getAlgorithm(), Algorithm::EltwisePowerStatic)) { - const auto power = dynamic_cast(this); + const auto power = dynamic_cast(this); if (!power) { OPENVINO_THROW("Cannot cast ", getName(), " to Eltwise"); } @@ -1638,25 +1700,30 @@ std::pair, std::vector> Node::getScalesAndShifts(const } switch (getAlgorithm()) { - case Algorithm::EltwiseAdd: { - scales.resize(shifts.size(), 1.0f); - break; - } - case Algorithm::EltwiseSubtract: { - scales.resize(shifts.size(), 1.0f); - std::transform(shifts.begin(), shifts.end(), shifts.begin(), [](float shift){ return -1.0f * shift; }); - break; - } - case Algorithm::EltwiseMultiply: { - shifts.resize(scales.size(), 0.0f); - break; - } - case Algorithm::EltwiseDivide: { - shifts.resize(scales.size(), 0.0f); - std::transform(scales.begin(), scales.end(), scales.begin(), [](float 
scale){ return 1.0f / scale; }); - break; - } - default: break; + case Algorithm::EltwiseAdd: { + scales.resize(shifts.size(), 1.0f); + break; + } + case Algorithm::EltwiseSubtract: { + scales.resize(shifts.size(), 1.0f); + std::transform(shifts.begin(), shifts.end(), shifts.begin(), [](float shift) { + return -1.0f * shift; + }); + break; + } + case Algorithm::EltwiseMultiply: { + shifts.resize(scales.size(), 0.0f); + break; + } + case Algorithm::EltwiseDivide: { + shifts.resize(scales.size(), 0.0f); + std::transform(scales.begin(), scales.end(), scales.begin(), [](float scale) { + return 1.0f / scale; + }); + break; + } + default: + break; } return {scales, shifts}; @@ -1673,7 +1740,7 @@ bool Node::isInputTensorAtPortEmpty(size_t port) const { auto edge = getParentEdgeAt(port); if (one_of(edge->getStatus(), Edge::Status::Allocated, Edge::Status::Validated)) { auto&& mem = edge->getMemory(); - if (mem.isDefined()) { + if (mem.isDefined() && !mem.getDesc().empty()) { return mem.getShape().hasZeroDims(); } } @@ -1823,22 +1890,25 @@ bool Node::canFuseSimpleOperation(const NodePtr& node) const { return ret; } else if (node->getType() == Type::Eltwise) { return DnnlExtensionUtils::isUnarySupportedAsPostOp(node->getAlgorithm()) || - node->canBePerformedAsScaleShift(this); + node->canBePerformedAsScaleShift(this); } return false; } -void Node::addFusedNode(const NodePtr &fusingNode) { +void Node::addFusedNode(const NodePtr& fusingNode) { fusedWith.push_back(fusingNode); } void Node::addSupportedPrimDesc(const std::vector& inPortConfigs, const std::vector& outPortConfigs, impl_desc_type implType) { - auto fill_port = [] (const PortConfigurator& portConfigurator, const Shape& shape, - ov::element::Type prc, std::vector& port) -> bool { - // In order to simplify particular node initialization logic we just don't add config in case target shape is not supported by blockedDescCreator. 
- // This should be suitable for major of scenarios since almost all nodes add `ncsp` blockedDescCreator which supports any shape rank. + auto fill_port = [](const PortConfigurator& portConfigurator, + const Shape& shape, + ov::element::Type prc, + std::vector& port) -> bool { + // In order to simplify particular node initialization logic we just don't add config in case target shape is + // not supported by blockedDescCreator. This should be suitable for major of scenarios since almost all nodes + // add `ncsp` blockedDescCreator which supports any shape rank. if (shape.getRank() < portConfigurator.blockedDescCreator->getMinimalRank()) return false; @@ -1855,14 +1925,16 @@ void Node::addSupportedPrimDesc(const std::vector& inPortConfi NodeConfig config; for (size_t i = 0; i < inPortConfigs.size(); i++) { auto shape = inPortConfigs[i].shape.getRank() == 0 ? getInputShapeAtPort(i) : inPortConfigs[i].shape; - auto prc = inPortConfigs[i].prc == ov::element::undefined ? getOriginalInputPrecisionAtPort(i) : inPortConfigs[i].prc; + auto prc = + inPortConfigs[i].prc == ov::element::undefined ? getOriginalInputPrecisionAtPort(i) : inPortConfigs[i].prc; if (!fill_port(inPortConfigs[i], shape, prc, config.inConfs)) return; } for (size_t i = 0; i < outPortConfigs.size(); i++) { auto dims = outPortConfigs[i].shape.getRank() == 0 ? getOutputShapeAtPort(i) : outPortConfigs[i].shape; - auto prc = outPortConfigs[i].prc == ov::element::undefined ? getOriginalOutputPrecisionAtPort(i) : outPortConfigs[i].prc; + auto prc = outPortConfigs[i].prc == ov::element::undefined ? 
getOriginalOutputPrecisionAtPort(i) + : outPortConfigs[i].prc; if (!fill_port(outPortConfigs[i], dims, prc, config.outConfs)) return; } @@ -1883,23 +1955,27 @@ void Node::fuseDQScales(const float* scaleData, const size_t scaleSize) { if (scaleSize > DQScales.size()) DQScales.resize(scaleSize, DQScales[0]); if (1 == scaleSize) { - std::transform(DQScales.begin(), DQScales.end(), DQScales.begin(), [=](float val){ return (scaleData[0] * val); }); - } else { - for (size_t i = 0; i < DQScales.size(); i++) { - DQScales[i] *= scaleData[i]; - } - } - if (std::all_of(DQScales.begin(), DQScales.end(), [OV_CAPTURE_CPY_AND_THIS](float val){ return (val == DQScales[0]);})) + std::transform(DQScales.begin(), DQScales.end(), DQScales.begin(), [=](float val) { + return (scaleData[0] * val); + }); + } else { + for (size_t i = 0; i < DQScales.size(); i++) { + DQScales[i] *= scaleData[i]; + } + } + if (std::all_of(DQScales.begin(), DQScales.end(), [OV_CAPTURE_CPY_AND_THIS](float val) { + return (val == DQScales[0]); + })) DQScales.resize(1); } int Node::inPlaceInputPort(int portIdx) const { if (inputShapes.empty()) { - //special case - a dead end node + // special case - a dead end node return -1; } - const NodeDesc *selected_pd = getSelectedPrimitiveDescriptor(); + const NodeDesc* selected_pd = getSelectedPrimitiveDescriptor(); if (!selected_pd) OPENVINO_THROW("Cannot find selected primitive descriptor for node: ", getName()); @@ -1917,11 +1993,11 @@ int Node::inPlaceInputPort(int portIdx) const { int Node::inPlaceOutPort(int portIdx) const { if (outputShapes.empty()) { - //special case - a dead end node + // special case - a dead end node return -1; } - const NodeDesc *selected_pd = getSelectedPrimitiveDescriptor(); + const NodeDesc* selected_pd = getSelectedPrimitiveDescriptor(); if (!selected_pd) OPENVINO_THROW("Cannot find selected primitive descriptor for node: ", getName()); @@ -1938,8 +2014,8 @@ int Node::inPlaceOutPort(int portIdx) const { } void 
Node::resolveInPlaceDirection() { - enum InplaceDirectionType {UP, DOWN, CYCLIC, NONE}; - enum PortType {INPUT, OUTPUT}; + enum InplaceDirectionType { UP, DOWN, CYCLIC, NONE }; + enum PortType { INPUT, OUTPUT }; auto inPlaceDirection = [](const Node* node, PortType portType, int portNum) -> InplaceDirectionType { if (PortType::INPUT == portType) { @@ -1989,7 +2065,8 @@ void Node::resolveInPlaceDirection() { if (auto pEdge = wEdge.lock()) { auto inpPort = pEdge->getOutputNum(); auto inPlaceInpPort = inPlaceInputPort(inpPort); - if (inPlaceInpPort < 0 || inPlaceDirection(this, PortType::INPUT, inpPort) != InplaceDirectionType::CYCLIC) { + if (inPlaceInpPort < 0 || + inPlaceDirection(this, PortType::INPUT, inpPort) != InplaceDirectionType::CYCLIC) { continue; } // inPlace memory cyclic dependency detected, need to resolve @@ -2001,12 +2078,14 @@ void Node::resolveInPlaceDirection() { config.inConfs[inpPort].inPlace(-1); initDescriptor(config); } else if (parentInPlaceDirection == InplaceDirectionType::DOWN) { - //search if siblings already have downstream direction + // search if siblings already have downstream direction auto downstreamPeers = [&] { for (auto& peerEdge : pParent->getChildEdgesAtPort(pEdge->getInputNum())) { auto peerNode = peerEdge->getChild().get(); - if (peerNode == this) continue; - if (inPlaceDirection(peerNode, PortType::INPUT, peerEdge->getOutputNum()) == InplaceDirectionType::DOWN) { + if (peerNode == this) + continue; + if (inPlaceDirection(peerNode, PortType::INPUT, peerEdge->getOutputNum()) == + InplaceDirectionType::DOWN) { return true; } } @@ -2067,7 +2146,8 @@ void Node::resolveInPlaceDirection() { // note: there are only non-inplace or cyclic-inplace descendants at the moment. 
std::function searchReferencingOutput; searchReferencingOutput = [&](const Node* node, int portIdx) -> void { - if (numConflicts > 1) return; // early stop + if (numConflicts > 1) + return; // early stop auto childEdges = node->getChildEdgesAtPort(portIdx); for (auto& edge : childEdges) { auto pChild = edge->getChild().get(); @@ -2076,7 +2156,8 @@ void Node::resolveInPlaceDirection() { } else { auto result = inPlaceDirection(pChild, PortType::INPUT, edge->getOutputNum()); if (InplaceDirectionType::CYCLIC == result) { - return searchReferencingOutput(pChild, pChild->inPlaceInputPort(edge->getOutputNum())); + return searchReferencingOutput(pChild, + pChild->inPlaceInputPort(edge->getOutputNum())); } } } @@ -2089,7 +2170,8 @@ void Node::resolveInPlaceDirection() { // note: the parent node does not use inPlace memory at the moment, let's check the siblings for (auto& peerEdge : pParent->getChildEdgesAtPort(pEdge->getInputNum())) { auto peerNode = peerEdge->getChild().get(); - if (peerNode == this) continue; + if (peerNode == this) + continue; if (Type::Output == peerNode->getType()) { numConflicts++; } else { @@ -2101,11 +2183,11 @@ void Node::resolveInPlaceDirection() { } } - if (numConflicts == 1) { // downstream to make the only output edge be referenced. + if (numConflicts == 1) { // downstream to make the only output edge be referenced. 
auto config = getSelectedPrimitiveDescriptor()->getConfig(); config.outConfs[inPlaceInpPort].inPlace(-1); initDescriptor(config); - } else { // the default direction of upstream + } else { // the default direction of upstream auto config = getSelectedPrimitiveDescriptor()->getConfig(); config.inConfs[inpPort].inPlace(-1); initDescriptor(config); @@ -2120,8 +2202,7 @@ void Node::resolveInPlaceDirection() { #ifndef CPU_DEBUG_CAPS std::ostream& operator<<(std::ostream& out, const Node& node) { - return out << "Node " << node.getName() << - " of type " << node.getTypeStr() << "\n"; + return out << "Node " << node.getName() << " of type " << node.getTypeStr() << "\n"; } std::ostream& operator<<(std::ostream& out, const Node* node) { @@ -2129,5 +2210,5 @@ std::ostream& operator<<(std::ostream& out, const Node* node) { } #endif -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/node.h b/src/plugins/intel_cpu/src/node.h index 948bd6999ce27a..9166e87dbf50e1 100644 --- a/src/plugins/intel_cpu/src/node.h +++ b/src/plugins/intel_cpu/src/node.h @@ -4,37 +4,38 @@ #pragma once +#include + #include +#include #include +#include +#include +#include +#include + #include "cpu_memory.h" #include "cpu_shape.h" #include "cpu_types.h" #include "edge.h" +#include "graph_context.h" #include "memory_desc/cpu_memory_desc.h" -#include "selective_build.h" #include "memory_desc/dnnl_memory_desc.h" +#include "nodes/executors/executor.hpp" +#include "nodes/node_config.h" #include "onednn/dnnl.h" #include "onednn/iml_type_mapper.h" -#include #include "openvino/cc/factory.h" #include "openvino/core/node.hpp" -#include -#include "nodes/node_config.h" -#include #include "perf_count.h" -#include "utils/debug_capabilities.h" +#include "selective_build.h" #include "utils/bit_util.hpp" #include "utils/debug_capabilities.h" -#include "graph_context.h" -#include "nodes/executors/executor.hpp" - -#include -#include -#include - 
-#define THROW_CPU_NODE_ERR(...) OPENVINO_THROW("[CPU] ", getTypeStr(), " node with name '", getName(), "' ", __VA_ARGS__) -#define CPU_NODE_ASSERT(condition, ...) OPENVINO_ASSERT(condition, getTypeStr(), " node with name '", getName(), "' ", __VA_ARGS__) +#define THROW_CPU_NODE_ERR(...) \ + OPENVINO_THROW("[CPU] ", getTypeStr(), " node with name '", getName(), "' ", __VA_ARGS__) +#define CPU_NODE_ASSERT(condition, ...) \ + OPENVINO_ASSERT(condition, getTypeStr(), " node with name '", getName(), "' ", __VA_ARGS__) namespace ov { namespace intel_cpu { @@ -45,13 +46,25 @@ using NodeWeakPtr = std::weak_ptr; class PortConfigurator { public: - PortConfigurator(ov::intel_cpu::LayoutType blockedDescType, ov::element::Type prc, const Shape& shape, - bool constant = false, int inPlace = -1) : - blockedDescCreator(getBlockedDescCreator(blockedDescType)), prc(prc), shape(shape), constant(constant), inPlace(inPlace) {} - - PortConfigurator(ov::intel_cpu::LayoutType blockedDescType, ov::element::Type prc = ov::element::undefined, - bool constant = false, int inPlace = -1) : - blockedDescCreator(getBlockedDescCreator(blockedDescType)), prc(prc), constant(constant), inPlace(inPlace) {} + PortConfigurator(ov::intel_cpu::LayoutType blockedDescType, + ov::element::Type prc, + const Shape& shape, + bool constant = false, + int inPlace = -1) + : blockedDescCreator(getBlockedDescCreator(blockedDescType)), + prc(prc), + shape(shape), + constant(constant), + inPlace(inPlace) {} + + PortConfigurator(ov::intel_cpu::LayoutType blockedDescType, + ov::element::Type prc = ov::element::undefined, + bool constant = false, + int inPlace = -1) + : blockedDescCreator(getBlockedDescCreator(blockedDescType)), + prc(prc), + constant(constant), + inPlace(inPlace) {} ov::intel_cpu::BlockedDescCreator::CreatorConstPtr blockedDescCreator; const ov::element::Type prc; @@ -60,7 +73,8 @@ class PortConfigurator { int inPlace = -1; private: - static ov::intel_cpu::BlockedDescCreator::CreatorConstPtr 
getBlockedDescCreator(ov::intel_cpu::LayoutType blockedDescType) { + static ov::intel_cpu::BlockedDescCreator::CreatorConstPtr getBlockedDescCreator( + ov::intel_cpu::LayoutType blockedDescType) { auto& creators = ov::intel_cpu::BlockedDescCreator::getCommonCreators(); if (creators.find(blockedDescType) == creators.end()) { OPENVINO_THROW("Cannot find tensor descriptor creator"); @@ -71,11 +85,15 @@ class PortConfigurator { class NodeDesc { public: - NodeDesc(NodeConfig conf, impl_desc_type type): - config(std::move(conf)), implementationType(type), executorFactory(nullptr) {} + NodeDesc(NodeConfig conf, impl_desc_type type) + : config(std::move(conf)), + implementationType(type), + executorFactory(nullptr) {} - NodeDesc(NodeConfig conf, impl_desc_type type, ExecutorFactoryLegacyPtr factory): - config(std::move(conf)), implementationType(type), executorFactory(factory) {} + NodeDesc(NodeConfig conf, impl_desc_type type, ExecutorFactoryLegacyPtr factory) + : config(std::move(conf)), + implementationType(type), + executorFactory(factory) {} const NodeConfig& getConfig() const { return config; @@ -98,8 +116,8 @@ class NodeDesc { } template ::value && !std::is_reference::value, int>::type = 0, - typename std::enable_if::value, int>::type = 0> + typename std::enable_if::value && !std::is_reference::value, int>::type = 0, + typename std::enable_if::value, int>::type = 0> std::shared_ptr getExecutorFactoryAs() { auto casted = std::dynamic_pointer_cast(executorFactory); if (!casted) @@ -119,34 +137,41 @@ class NodeDesc { class Node { public: - Node(const Node &) = delete; - Node & operator = (const Node &) = delete; + Node(const Node&) = delete; + Node& operator=(const Node&) = delete; using AttrPtr = std::shared_ptr; public: - template + template struct Tag {}; struct PerfCounters { PerfCounters(std::string const& name) - : execute(openvino::itt::handle(name)) - , getSupportedDescriptors(openvino::itt::handle>("Node::getSupportedDescriptors")) - , 
initSupportedPrimitiveDescriptors(openvino::itt::handle>("Node::initSupportedPrimitiveDescriptors")) - , filterSupportedPrimitiveDescriptors(openvino::itt::handle>("Node::filterSupportedPrimitiveDescriptors")) - , selectOptimalPrimitiveDescriptor(openvino::itt::handle>("Node::selectOptimalPrimitiveDescriptor")) - , createPrimitive(openvino::itt::handle>("Node::createPrimitive")) - , initOptimalPrimitiveDescriptor(openvino::itt::handle>("Node::initOptimalPrimitiveDescriptor")) - {} - - template + : execute(openvino::itt::handle(name)), + getSupportedDescriptors(openvino::itt::handle>("Node::getSupportedDescriptors")), + initSupportedPrimitiveDescriptors( + openvino::itt::handle>("Node::initSupportedPrimitiveDescriptors")), + filterSupportedPrimitiveDescriptors( + openvino::itt::handle>("Node::filterSupportedPrimitiveDescriptors")), + selectOptimalPrimitiveDescriptor( + openvino::itt::handle>("Node::selectOptimalPrimitiveDescriptor")), + createPrimitive(openvino::itt::handle>("Node::createPrimitive")), + initOptimalPrimitiveDescriptor( + openvino::itt::handle>("Node::initOptimalPrimitiveDescriptor")) {} + + template void buildClassCounters(const std::string& type_name) { getSupportedDescriptors = openvino::itt::handle>(type_name + "::getSupportedDescriptors"); - initSupportedPrimitiveDescriptors = openvino::itt::handle>(type_name + "::initSupportedPrimitiveDescriptors"); - filterSupportedPrimitiveDescriptors = openvino::itt::handle>(type_name + "::filterSupportedPrimitiveDescriptors"); - selectOptimalPrimitiveDescriptor = openvino::itt::handle>(type_name + "::selectOptimalPrimitiveDescriptor"); + initSupportedPrimitiveDescriptors = + openvino::itt::handle>(type_name + "::initSupportedPrimitiveDescriptors"); + filterSupportedPrimitiveDescriptors = + openvino::itt::handle>(type_name + "::filterSupportedPrimitiveDescriptors"); + selectOptimalPrimitiveDescriptor = + openvino::itt::handle>(type_name + "::selectOptimalPrimitiveDescriptor"); createPrimitive = 
openvino::itt::handle>(type_name + "::createPrimitive"); - initOptimalPrimitiveDescriptor = openvino::itt::handle>(type_name + "::initOptimalPrimitiveDescriptor"); + initOptimalPrimitiveDescriptor = + openvino::itt::handle>(type_name + "::initOptimalPrimitiveDescriptor"); } openvino::itt::handle_t execute; @@ -159,7 +184,7 @@ class Node { }; class NodesFactory; - static NodesFactory & factory(); + static NodesFactory& factory(); virtual ~Node() = default; @@ -171,11 +196,12 @@ class Node { void remove(); void addParentEdge(const EdgePtr& edge) { - assert(std::none_of(parentEdges.begin(), parentEdges.end(), - [&edge](const EdgeWeakPtr& _edge){ - return _edge.lock()->getOutputNum() == edge->getOutputNum(); - })); - parentEdges.insert(std::upper_bound(parentEdges.begin(), parentEdges.end(), edge, + assert(std::none_of(parentEdges.begin(), parentEdges.end(), [&edge](const EdgeWeakPtr& _edge) { + return _edge.lock()->getOutputNum() == edge->getOutputNum(); + })); + parentEdges.insert(std::upper_bound(parentEdges.begin(), + parentEdges.end(), + edge, [](const EdgeWeakPtr& lhs, const EdgeWeakPtr& rhs) { return lhs.lock()->getOutputNum() < rhs.lock()->getOutputNum(); }), @@ -196,11 +222,11 @@ class Node { removeEdge(edge, childEdges); } - const std::vector &getParentEdges() const noexcept { + const std::vector& getParentEdges() const noexcept { return parentEdges; } - const std::vector &getChildEdges() const noexcept { + const std::vector& getChildEdges() const noexcept { return childEdges; } @@ -238,7 +264,7 @@ class Node { return getSrcMemoryAtPort(idx)->getData(); } - template + template T* getSrcDataAtPortAs(size_t idx) const { return getSrcMemoryAtPort(idx)->getDataAs(); } @@ -247,7 +273,7 @@ class Node { return getDstMemoryAtPort(idx)->getData(); } - template + template T* getDstDataAtPortAs(size_t idx) const { return getDstMemoryAtPort(idx)->getDataAs(); } @@ -273,7 +299,8 @@ class Node { enum class ConstantType { Const, // Node is placed in a constant subgraph 
NoConst, // Node is placed in a non-constant subgraph - StrictNoConst, // Node produces non-constant subgraph: this type can't be changed and it does not depend on the parent nodes' ConstantType. + StrictNoConst, // Node produces non-constant subgraph: this type can't be changed and it does not depend on the + // parent nodes' ConstantType. }; ConstantType getConstantType() const; void updateConstantType(); @@ -290,10 +317,11 @@ class Node { bool isFusedWith(Type type) const; - virtual void addFusedNode(const NodePtr &fusingNode); + virtual void addFusedNode(const NodePtr& fusingNode); virtual void fuseInto(NodePtr& parentNode) { - // The graph supports fusing only of consecutive nodes and some graph logic requires to know through which input port a node was fused into parent one. + // The graph supports fusing only of consecutive nodes and some graph logic requires to know through which input + // port a node was fused into parent one. for (size_t i = 0; i < getParentEdges().size(); i++) { if (getParentEdgeAt(i)->getParent().get() == parentNode.get()) { setFusingPort(i); @@ -323,15 +351,15 @@ class Node { fusedWith.clear(); } - void mergeWith(const NodePtr &merge) { + void mergeWith(const NodePtr& merge) { mergedWith.push_back(merge); } - const std::vector &getMergeWith() { + const std::vector& getMergeWith() { return mergedWith; } - const std::vector &getFusedWith() { + const std::vector& getFusedWith() { return fusedWith; } @@ -343,17 +371,17 @@ class Node { this->fusingPort = fusingPort; } - const std::string &getName() const { + const std::string& getName() const { return name; } void addOriginalLayer(const std::string& layerName); - const std::string &getOriginalLayers() const { + const std::string& getOriginalLayers() const { return originalLayers; } - const std::string &getParallelDomain() const { + const std::string& getParallelDomain() const { return parallelDomain; } @@ -437,7 +465,9 @@ class Node { virtual std::string getPrimitiveDescriptorType() const; 
- PerfCount &PerfCounter() { return perfCounter; } + PerfCount& PerfCounter() { + return perfCounter; + } virtual void resolveInPlaceEdges(Edge::LOOK look = Edge::LOOK_BOTH); @@ -448,7 +478,7 @@ class Node { void updateShapes(); void updateDynamicParams(); void executeDynamic(dnnl::stream strm, int numaId = -1); - virtual void redefineOutputMemory(const std::vector &newShapes); + virtual void redefineOutputMemory(const std::vector& newShapes); void redefineOutputMemory(const size_t port, const VectorDims& new_output_shape); bool outputShapeDataDependency() const; @@ -475,7 +505,8 @@ class Node { /** * @brief Performs Node initialization based on graph context. - * This is an auxiliary method that allows to use information not available in Node constructor (e.g. connection information with other nodes) + * This is an auxiliary method that allows to use information not available in Node constructor (e.g. connection + * information with other nodes) */ virtual void init() {} @@ -483,11 +514,11 @@ class Node { return execIndex; } - const std::string & getTypeStr() const { + const std::string& getTypeStr() const { return typeStr; } - void setTypeStr(const std::string &typeStr) { + void setTypeStr(const std::string& typeStr) { this->typeStr = typeStr; } @@ -499,11 +530,11 @@ class Node { return 1; } - const PerfCounters & perfCounters() const { + const PerfCounters& perfCounters() const { return profiling; } - PerfCounters & perfCounters() { + PerfCounters& perfCounters() { return profiling; } @@ -588,7 +619,7 @@ class Node { return false; } - bool canBePerformedAsScaleShift(const Node *parentNode = nullptr) const; + bool canBePerformedAsScaleShift(const Node* parentNode = nullptr) const; bool isDynamicNode() const { return isDynamic; @@ -613,14 +644,14 @@ class Node { } /** - * @brief Return scales and shift if nodes can be executed as ScaleShift, else raise exception - * If node has only scale or shift value, fill missing value with default values - * i.e. 
EltwiseAdd: fill shifts from constant, fill scales with default values = 1.0f - * @param parentNode - * node from which data comes - * @return pair of scales and shifts - */ - std::pair, std::vector> getScalesAndShifts(const Node *parentNode) const; + * @brief Return scales and shift if nodes can be executed as ScaleShift, else raise exception + * If node has only scale or shift value, fill missing value with default values + * i.e. EltwiseAdd: fill shifts from constant, fill scales with default values = 1.0f + * @param parentNode + * node from which data comes + * @return pair of scales and shifts + */ + std::pair, std::vector> getScalesAndShifts(const Node* parentNode) const; void fuseDQScales(const float* scaleData, const size_t scaleSize); const std::vector& getDQScales() const { @@ -631,8 +662,14 @@ class Node { * Seed node should call this routine and pass its post operations list as parameter. * @param ops List of fused post operations */ - virtual void appendPostOps(dnnl::post_ops& ops, const VectorDims& postOpDims, std::unordered_map& postOpsMem, const int channelAxis = 1); - virtual void appendPostOps(dnnl::post_ops& ops, const VectorDims& postOpDims, std::vector& postOpsMem, const int channelAxis = 1); + virtual void appendPostOps(dnnl::post_ops& ops, + const VectorDims& postOpDims, + std::unordered_map& postOpsMem, + const int channelAxis = 1); + virtual void appendPostOps(dnnl::post_ops& ops, + const VectorDims& postOpDims, + std::vector& postOpsMem, + const int channelAxis = 1); virtual bool canBeExecutedInInt8() const { OPENVINO_THROW_NOT_IMPLEMENTED("canBeExecutedInInt8 not implemented for node with type ", NameFromType(getType())); @@ -649,22 +686,24 @@ class Node { this->type = type; } - virtual PortDescBasePtr getConsistentInputDesc(const NodeConfig &config, size_t idx) const; - virtual PortDescBasePtr getConsistentOutputDesc(const NodeConfig &config, size_t idx) const; - virtual MemoryDescPtr getSrcMemDesc(const dnnl::primitive_desc &prim_desc, 
size_t idx) const; - virtual MemoryDescPtr getDstMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const; + virtual PortDescBasePtr getConsistentInputDesc(const NodeConfig& config, size_t idx) const; + virtual PortDescBasePtr getConsistentOutputDesc(const NodeConfig& config, size_t idx) const; + virtual MemoryDescPtr getSrcMemDesc(const dnnl::primitive_desc& prim_desc, size_t idx) const; + virtual MemoryDescPtr getDstMemDesc(const dnnl::primitive_desc& prim_desc, size_t idx) const; - virtual AttrPtr initPrimitiveAttr() { return nullptr; } + virtual AttrPtr initPrimitiveAttr() { + return nullptr; + } - typedef std::function - GetPrimitiveMemoryFormatFunc; + typedef std::function + GetPrimitiveMemoryFormatFunc; std::vector internalBlobDesc; std::vector inputShapes; std::vector outputShapes; - std::vector fusedWith; - std::vector mergedWith; + std::vector fusedWith; + std::vector mergedWith; int curNumaNode = -1; @@ -672,11 +711,11 @@ class Node { virtual void toNumaNodeImpl(int numaID); std::string primitivesPriority; - std::vector customImplPriorities; - std::vector inputMemoryFormatsFilter; - std::vector outputMemoryFormatsFilter; + std::vector customImplPriorities; + std::vector inputMemoryFormatsFilter; + std::vector outputMemoryFormatsFilter; bool enforceBF16evenForGraphTail = false; - bool keepOriginalPrecision = false; + bool keepOriginalPrecision = false; std::string originalLayers; // contains names of the original layers separated by comma std::string parallelDomain; @@ -692,11 +731,7 @@ class Node { int selectedPrimitiveDescriptorIndex = -1; - enum class InPlaceType { - Unknown, - InPlace, - NoInPlace - }; + enum class InPlaceType { Unknown, InPlace, NoInPlace }; mutable InPlaceType inplace = InPlaceType::Unknown; ConstantType constant = ConstantType::NoConst; std::vector internalBlobs; @@ -718,7 +753,7 @@ class Node { void selectPreferPrimitiveDescriptorWithShape(const std::vector& priority, bool ignoreConstInputs); bool isOneDimShape(const 
ov::PartialShape& pshape); bool isReorderRequired(ov::intel_cpu::MemoryDescPtr desc1, ov::intel_cpu::MemoryDescPtr desc2); - bool isConfigDefined(const NodeConfig &config) const; + bool isConfigDefined(const NodeConfig& config) const; virtual bool canBeInPlace() const; /* returns default implementaion prioirity */ @@ -733,13 +768,15 @@ class Node { /** * @brief Auxiliary function to get node input precisions - * @return Vector of precisions based on information from node input edges. Return empty vector in case edges are not initialized yet. + * @return Vector of precisions based on information from node input edges. Return empty vector in case edges are + * not initialized yet. */ virtual std::vector getInputPrecisions() const; /** * @brief Auxiliary function to get node output precisions - * @return Vector of precisions based on information from node output edges. Return empty vector in case edges are not initialized yet. + * @return Vector of precisions based on information from node output edges. Return empty vector in case edges are + * not initialized yet. */ virtual std::vector getOutputPrecisions() const; @@ -788,7 +825,7 @@ class Node { MemoryPtr getScratchPadMem(const MemoryDescPtr& desc) { if (!scratchpadMem || !scratchpadMem->getDesc().isCompatible(*desc)) { - scratchpadMem = context->getScratchPad(curNumaNode)->createScratchPadMem(desc); + scratchpadMem = context->getScratchPad()->createScratchPadMem(desc); } return scratchpadMem; } @@ -803,13 +840,14 @@ class Node { // is still under control of strong references outside of cache. // privateWeightCache is for holding strong references to constant weight // copies of same content with different layouts. 
- std::shared_ptr> privateWeightCache - = std::make_shared>(); + std::shared_ptr> privateWeightCache = + std::make_shared>(); private: - static void removeEdge(const EdgePtr edge, std::vector &edges) { - edges.erase(std::remove_if(edges.begin(), edges.end(), - [&edge] (EdgeWeakPtr _edge) { + static void removeEdge(const EdgePtr edge, std::vector& edges) { + edges.erase(std::remove_if(edges.begin(), + edges.end(), + [&edge](EdgeWeakPtr _edge) { return _edge.lock() == edge; }), edges.end()); @@ -856,22 +894,20 @@ constexpr uint64_t PortMask(T... rest) { return util::bit::mask(rest...); } -class Node::NodesFactory : public openvino::cc::Factory& op, - const GraphContext::CPtr)> { +class Node::NodesFactory + : public openvino::cc::Factory& op, const GraphContext::CPtr)> { public: NodesFactory(); Node* create(const std::shared_ptr& op, const GraphContext::CPtr context); }; -template +template struct NodeImpl : public NodeType { - NodeImpl(const std::shared_ptr& op, const GraphContext::CPtr context) - : NodeType(op, context) { + NodeImpl(const std::shared_ptr& op, const GraphContext::CPtr context) : NodeType(op, context) { NodeType::perfCounters().template buildClassCounters(NameFromType(NodeType::getType())); } }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/adaptive_pooling.cpp b/src/plugins/intel_cpu/src/nodes/adaptive_pooling.cpp index f4e7f6217a8dec..f4af11b0f2362a 100644 --- a/src/plugins/intel_cpu/src/nodes/adaptive_pooling.cpp +++ b/src/plugins/intel_cpu/src/nodes/adaptive_pooling.cpp @@ -3,18 +3,21 @@ // #include "adaptive_pooling.h" -#include "openvino/core/parallel.hpp" -#include "cpu/x64/cpu_isa_traits.hpp" + #include -#include "onednn/dnnl.h" -#include "dnnl_extension_utils.h" -#include "selective_build.h" + #include #include #include -#include "utils/general_utils.h" #include + +#include "cpu/x64/cpu_isa_traits.hpp" +#include "dnnl_extension_utils.h" +#include 
"onednn/dnnl.h" +#include "openvino/core/parallel.hpp" +#include "selective_build.h" #include "shape_inference/custom/adaptive_pooling.hpp" +#include "utils/general_utils.h" using namespace dnnl; using namespace dnnl::impl::cpu::x64; @@ -23,7 +26,8 @@ namespace ov { namespace intel_cpu { namespace node { -bool AdaptivePooling::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool AdaptivePooling::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage) noexcept { try { if (one_of(op->get_type_info(), ov::op::v8::AdaptiveAvgPool::get_type_info_static())) { auto adaPool = std::dynamic_pointer_cast(op); @@ -51,9 +55,9 @@ AdaptivePooling::AdaptivePooling(const std::shared_ptr& op, const Grap : Node(op, context, AdaptivePoolingShapeInferFactory(op)) { std::string errorMessage; if (isSupportedOperation(op, errorMessage)) { - errorPrefix = "Adaptive Pooling layer with name '" + getName() + "' "; + errorPrefix = "Adaptive Pooling layer with name '" + getName() + "' "; } else { - OPENVINO_THROW_NOT_IMPLEMENTED(errorMessage); + OPENVINO_THROW_NOT_IMPLEMENTED(errorMessage); } if (one_of(op->get_type_info(), ov::op::v8::AdaptiveAvgPool::get_type_info_static())) { algorithm = Algorithm::AdaptivePoolingAvg; @@ -104,14 +108,14 @@ void AdaptivePooling::initSupportedPrimitiveDescriptors() { // we supports only fp32 currently precision = ov::element::f32; - std::vector dataFormats{ LayoutType::ncsp }; - const auto &inDims = getInputShapeAtPort(0).getDims(); + std::vector dataFormats{LayoutType::ncsp}; + const auto& inDims = getInputShapeAtPort(0).getDims(); if (inDims[1] != Shape::UNDEFINED_DIM && inDims[1] != 1) { dataFormats.push_back(LayoutType::nspc); dataFormats.push_back(LayoutType::nCsp16c); dataFormats.push_back(LayoutType::nCsp8c); } - for (const auto &df : dataFormats) { + for (const auto& df : dataFormats) { if (algorithm == Algorithm::AdaptivePoolingAvg) { addSupportedPrimDesc({{df, precision}, {LayoutType::ncsp, 
ov::element::i32}}, {{df, precision}}, @@ -134,9 +138,9 @@ void AdaptivePooling::execute(dnnl::stream strm) { if (!(inputPrec == dnnl_f32 && outputPrec == dnnl_f32)) OPENVINO_THROW(errorPrefix, "doesn't support demanded precisions"); - auto &srcMemory0 = getParentEdgeAt(0)->getMemory(); - auto &srcMemory1 = getParentEdgeAt(1)->getMemory(); - int *indexDst = nullptr; + auto& srcMemory0 = getParentEdgeAt(0)->getMemory(); + auto& srcMemory1 = getParentEdgeAt(1)->getMemory(); + int* indexDst = nullptr; if (algorithm == Algorithm::AdaptivePoolingMax) { indexDst = getDstDataAtPortAs(1); @@ -144,14 +148,15 @@ void AdaptivePooling::execute(dnnl::stream strm) { auto isPlainFmt = srcMemory0.getDesc().hasLayoutType(LayoutType::ncsp); auto isTailCFmt = srcMemory0.getDesc().hasLayoutType(LayoutType::nspc); - auto isBlkFmt = srcMemory0.getDesc().hasLayoutType(LayoutType::nCsp16c) || srcMemory0.getDesc().hasLayoutType(LayoutType::nCsp8c); + auto isBlkFmt = srcMemory0.getDesc().hasLayoutType(LayoutType::nCsp16c) || + srcMemory0.getDesc().hasLayoutType(LayoutType::nCsp8c); auto srcBlockDesc = srcMemory0.getDescWithType(); int blockSize = isBlkFmt ? srcBlockDesc->getBlockDims().back() : 1; - const auto *src = getSrcDataAtPortAs(0); - const auto *srcPooledSpatialShapes = getSrcDataAtPortAs(1); - auto *dst = getDstDataAtPortAs(0); + const auto* src = getSrcDataAtPortAs(0); + const auto* srcPooledSpatialShapes = getSrcDataAtPortAs(1); + auto* dst = getDstDataAtPortAs(0); if (static_cast(srcMemory1.getShape().getElementsCount()) != spatialDimsCount) OPENVINO_THROW(errorPrefix, @@ -175,8 +180,9 @@ void AdaptivePooling::execute(dnnl::stream strm) { const int iHW = IH * IW; const int oDHW = OD * OH * OW, oHW = OH * OW; - const int chPadding = blockSize * (isBlkFmt ? srcBlockDesc->getBlockDims()[1] : srcMemory0.getShape().getStaticDims()[1]); - const int blockCount = (isTailCFmt ? 1 : chPadding / blockSize); + const int chPadding = + blockSize * (isBlkFmt ? 
srcBlockDesc->getBlockDims()[1] : srcMemory0.getShape().getStaticDims()[1]); + const int blockCount = (isTailCFmt ? 1 : chPadding / blockSize); auto selectedPrimitiveDescriptor = getSelectedPrimitiveDescriptor(); if (!selectedPrimitiveDescriptor) OPENVINO_THROW(errorPrefix, "doesn't have primitive descriptors."); @@ -186,27 +192,26 @@ void AdaptivePooling::execute(dnnl::stream strm) { // unified strides array const size_t tailDimsOffset = (isTailCFmt ? -1 : 0); - const size_t inStrides[5] = { - srcStrides[0], - (isTailCFmt ? 1 : srcStrides[1]), - (spatialDimsCount == 3 ? srcStrides[2 + tailDimsOffset] : 0), - (spatialDimsCount >= 2 ? srcStrides[spatialDimsCount + tailDimsOffset] : 0), - srcStrides[spatialDimsCount + 1 + tailDimsOffset] }; - const size_t outStrides[5] = { - dstStrides[0], - (isTailCFmt ? 1 : dstStrides[1]), - (spatialDimsCount == 3 ? dstStrides[2 + tailDimsOffset] : 0), - (spatialDimsCount >= 2 ? dstStrides[spatialDimsCount + tailDimsOffset] : 0), - dstStrides[spatialDimsCount + 1 + tailDimsOffset] }; - - std::function pool; - auto poolMax = [&] (const float *srcData, float *dstData, int od, int oh, int ow, size_t spatIndOff) { + const size_t inStrides[5] = {srcStrides[0], + (isTailCFmt ? 1 : srcStrides[1]), + (spatialDimsCount == 3 ? srcStrides[2 + tailDimsOffset] : 0), + (spatialDimsCount >= 2 ? srcStrides[spatialDimsCount + tailDimsOffset] : 0), + srcStrides[spatialDimsCount + 1 + tailDimsOffset]}; + const size_t outStrides[5] = {dstStrides[0], + (isTailCFmt ? 1 : dstStrides[1]), + (spatialDimsCount == 3 ? dstStrides[2 + tailDimsOffset] : 0), + (spatialDimsCount >= 2 ? 
dstStrides[spatialDimsCount + tailDimsOffset] : 0), + dstStrides[spatialDimsCount + 1 + tailDimsOffset]}; + + std::function pool; + auto poolMax = [&](const float* srcData, float* dstData, int od, int oh, int ow, size_t spatIndOff) { size_t dStart, dEnd, hStart, hEnd, wStart, wEnd; setBinBorders(&dStart, &dEnd, od, ID, OD); setBinBorders(&hStart, &hEnd, oh, IH, OH); setBinBorders(&wStart, &wEnd, ow, IW, OW); - float res = srcData[dStart * inStrides[2] + hStart * inStrides[3] + wStart * inStrides[4]]; // initial max value - int resIndex = dStart * iHW + hStart * IW + wStart; // initial max index + float res = + srcData[dStart * inStrides[2] + hStart * inStrides[3] + wStart * inStrides[4]]; // initial max value + int resIndex = dStart * iHW + hStart * IW + wStart; // initial max index for (size_t pixD = dStart; pixD < dEnd; pixD++) { for (size_t pixH = hStart; pixH < hEnd; pixH++) { for (size_t pixW = wStart; pixW < wEnd; pixW++) { @@ -219,7 +224,7 @@ void AdaptivePooling::execute(dnnl::stream strm) { *dstData = res; indexDst[spatIndOff * oDHW + od * oHW + oh * OW + ow] = resIndex; }; - auto poolAvg = [&] (const float *srcData, float *dstData, int od, int oh, int ow, size_t spatIndOff) { + auto poolAvg = [&](const float* srcData, float* dstData, int od, int oh, int ow, size_t spatIndOff) { size_t dStart, dEnd, hStart, hEnd, wStart, wEnd; setBinBorders(&dStart, &dEnd, od, ID, OD); setBinBorders(&hStart, &hEnd, oh, IH, OH); @@ -245,11 +250,10 @@ void AdaptivePooling::execute(dnnl::stream strm) { pool = poolAvg; } - parallel_for5d(N, blockCount, OD, OH, OW, - [&](int n, int blkIdx, int od, int oh, int ow) { + parallel_for5d(N, blockCount, OD, OH, OW, [&](int n, int blkIdx, int od, int oh, int ow) { auto srcData = src + n * inStrides[0] + blkIdx * inStrides[1]; - auto dstData = dst + n * outStrides[0] + blkIdx * outStrides[1] + - od * outStrides[2] + oh * outStrides[3] + ow * outStrides[4]; + auto dstData = dst + n * outStrides[0] + blkIdx * outStrides[1] + od * 
outStrides[2] + oh * outStrides[3] + + ow * outStrides[4]; int cStart = 0, cEnd = C, inResidual = 0, outResidual = 0; if (!isTailCFmt) { cStart = blkIdx * blockSize; @@ -263,18 +267,23 @@ void AdaptivePooling::execute(dnnl::stream strm) { inResidual = outResidual = c % blockSize; } pool(srcData + inResidual, dstData + outResidual, od, oh, ow, n * C + c); - }}); + } + }); } bool AdaptivePooling::created() const { return getType() == Type::AdaptivePooling; } -inline void AdaptivePooling::setBinBorders(size_t *startPtr, size_t *endPtr, size_t idx, size_t inputLength, size_t outputLength) { +inline void AdaptivePooling::setBinBorders(size_t* startPtr, + size_t* endPtr, + size_t idx, + size_t inputLength, + size_t outputLength) { *(startPtr) = idx * inputLength / outputLength; *(endPtr) = ceil(static_cast((idx + 1) * inputLength) / outputLength); } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/adaptive_pooling.h b/src/plugins/intel_cpu/src/nodes/adaptive_pooling.h index c88c9b5989aef9..04b628a5da5cee 100644 --- a/src/plugins/intel_cpu/src/nodes/adaptive_pooling.h +++ b/src/plugins/intel_cpu/src/nodes/adaptive_pooling.h @@ -5,9 +5,11 @@ #pragma once #include -#include + #include +#include #include + #include "dnnl_extension_utils.h" namespace ov { @@ -29,16 +31,18 @@ class AdaptivePooling : public Node { int spatialDimsCount; mutable std::vector spatialDimsValue = {}; ov::element::Type precision = ov::element::f32; - inline void setBinBorders(size_t *startPtr, size_t *endPtr, size_t idx, size_t inputLength, size_t outputLength); + inline void setBinBorders(size_t* startPtr, size_t* endPtr, size_t idx, size_t inputLength, size_t outputLength); std::string errorPrefix; protected: bool needShapeInfer() const override; - bool needPrepareParams() const override { return false; }; + bool needPrepareParams() const override { + return false; + }; 
void executeDynamicImpl(dnnl::stream strm) override; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/batch_to_space.cpp b/src/plugins/intel_cpu/src/nodes/batch_to_space.cpp index 80713e90750e2d..50665c083ec930 100644 --- a/src/plugins/intel_cpu/src/nodes/batch_to_space.cpp +++ b/src/plugins/intel_cpu/src/nodes/batch_to_space.cpp @@ -2,14 +2,16 @@ // SPDX-License-Identifier: Apache-2.0 // -#include +#include "batch_to_space.h" + +#include #include +#include + #include "dnnl_types.h" +#include "nodes/common/blocked_desc_creator.h" #include "openvino/core/parallel.hpp" #include "selective_build.h" -#include "batch_to_space.h" -#include "nodes/common/blocked_desc_creator.h" -#include namespace ov { namespace intel_cpu { @@ -40,8 +42,8 @@ BatchToSpace::BatchToSpace(const std::shared_ptr& op, const GraphConte if (inputShapes.size() != 4 || outputShapes.size() != 1) OPENVINO_THROW(errorPrefix, " has incorrect number of input or output edges!"); - const auto &inDims = getInputShapeAtPort(0).getDims(); - const auto &outDims = getOutputShapeAtPort(0).getDims(); + const auto& inDims = getInputShapeAtPort(0).getDims(); + const auto& outDims = getOutputShapeAtPort(0).getDims(); if (inDims.size() < 4 || inDims.size() > 5) OPENVINO_THROW(errorPrefix, " has unsupported 'data' input rank: ", inDims.size()); if (inDims.size() != outDims.size()) @@ -52,7 +54,7 @@ void BatchToSpace::initSupportedPrimitiveDescriptors() { if (!supportedPrimitiveDescriptors.empty()) return; - const auto &inDims = getInputShapeAtPort(0).getDims(); + const auto& inDims = getInputShapeAtPort(0).getDims(); const auto precision = getOriginalInputPrecisionAtPort(0); const std::set supported_precision_sizes = {1, 2, 4, 8}; if (supported_precision_sizes.find(precision.size()) == supported_precision_sizes.end()) @@ -88,7 +90,7 @@ void BatchToSpace::initSupportedPrimitiveDescriptors() 
{ } } -static std::vector getShape5D(const VectorDims &shape) { +static std::vector getShape5D(const VectorDims& shape) { std::vector shape5D(5, 1); for (int i = 0; i < 2; i++) { shape5D[i] = shape[i]; @@ -98,26 +100,26 @@ static std::vector getShape5D(const VectorDims &shape) { return shape5D; } -template +template void BatchToSpace::batchToSpaceKernel() { - const auto *srcData = getSrcDataAtPortAs(0); - const auto *blockShapesPtr = getSrcDataAtPortAs(1); + const auto* srcData = getSrcDataAtPortAs(0); + const auto* blockShapesPtr = getSrcDataAtPortAs(1); size_t dataRank = getSrcMemoryAtPort(0)->getShape().getRank(); blockShapeIn.clear(); for (size_t i = 0; i < dataRank; i++) { blockShapeIn.push_back(*(blockShapesPtr + i)); } - const auto *padsBeginPtr = getSrcDataAtPortAs(2); + const auto* padsBeginPtr = getSrcDataAtPortAs(2); cropsBeginIn.clear(); for (size_t i = 0; i < dataRank; i++) { cropsBeginIn.push_back(*(padsBeginPtr + i)); } - auto *dstData = getDstDataAtPortAs(0); + auto* dstData = getDstDataAtPortAs(0); - const auto &inDims = getParentEdgeAt(0)->getMemory().getStaticDims(); - const auto &outDims = getChildEdgeAt(0)->getMemory().getStaticDims(); + const auto& inDims = getParentEdgeAt(0)->getMemory().getStaticDims(); + const auto& outDims = getChildEdgeAt(0)->getMemory().getStaticDims(); auto srcDesc = getParentEdgeAt(0)->getMemory().getDescWithType(); @@ -193,8 +195,8 @@ void BatchToSpace::batchToSpaceKernel() { const int64_t addTmpOC = blocked ? 0lu : oAdd[1]; const int64_t addTmpOc = blocked ? oAdd[1] : 0lu; - const size_t firstI1 = i0 == 0 ? std::max(begin[1], indxStart[1]) : begin[1]; - const size_t lastI1 = i0 == indxEnd[0] ? std::min(indxEnd[1] + 1, finish[1]) : finish[1]; + const size_t firstI1 = i0 == 0 ? std::max(begin[1], indxStart[1]) : begin[1]; + const size_t lastI1 = i0 == indxEnd[0] ? std::min(indxEnd[1] + 1, finish[1]) : finish[1]; for (size_t i1 = firstI1; i1 < lastI1; ++i1) { const size_t block = i1 == finish[1] ? 
lastBlock : blockSize; @@ -216,12 +218,13 @@ void BatchToSpace::batchToSpaceKernel() { const size_t dstIdx4 = dstIdx3 + tmpOw * blockSize; for (size_t it = 0; it < itEnd + 1; ++it) { const size_t i5Begin = it == 0 ? 0 : (it * blockSize - 1 - oAdd[1]) / blockShape[1] + 1; - const size_t i5End = it == itEnd ? (block - 1) : ((it + 1) * blockSize - 1 - oAdd[1]) / blockShape[1]; + const size_t i5End = + it == itEnd ? (block - 1) : ((it + 1) * blockSize - 1 - oAdd[1]) / blockShape[1]; for (size_t i5 = i5Begin; i5 < i5End + 1; ++i5) { const int64_t tmpOc = i5 * blockShape[1] + addTmpOc; const size_t srcIdx5 = srcIdx4 + i5; const size_t dstIdx5 = - dstIdx4 + it * outSpatialStep * blockSize + (tmpOc - it * blockSize); + dstIdx4 + it * outSpatialStep * blockSize + (tmpOc - it * blockSize); dstData[dstIdx5] = srcData[srcIdx5]; } } @@ -239,13 +242,19 @@ void BatchToSpace::executeDynamicImpl(dnnl::stream strm) { void BatchToSpace::execute(dnnl::stream strm) { switch (getParentEdgeAt(0)->getMemory().getDesc().getPrecision().size()) { - case 1: batchToSpaceKernel::value_type>(); break; - case 2: batchToSpaceKernel::value_type>(); break; - case 4: batchToSpaceKernel::value_type>(); break; - default: - OPENVINO_THROW("BatchToSpace layer does not support precision '", - std::string(getParentEdgeAt(0)->getMemory().getDesc().getPrecision().get_type_name()), - "'"); + case 1: + batchToSpaceKernel::value_type>(); + break; + case 2: + batchToSpaceKernel::value_type>(); + break; + case 4: + batchToSpaceKernel::value_type>(); + break; + default: + OPENVINO_THROW("BatchToSpace layer does not support precision '", + std::string(getParentEdgeAt(0)->getMemory().getDesc().getPrecision().get_type_name()), + "'"); } } @@ -253,6 +262,6 @@ bool BatchToSpace::created() const { return getType() == Type::BatchToSpace; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git 
a/src/plugins/intel_cpu/src/nodes/batch_to_space.h b/src/plugins/intel_cpu/src/nodes/batch_to_space.h index 1b583f74bd7905..5211e0c0b5dd10 100644 --- a/src/plugins/intel_cpu/src/nodes/batch_to_space.h +++ b/src/plugins/intel_cpu/src/nodes/batch_to_space.h @@ -14,7 +14,7 @@ class BatchToSpace : public Node { public: BatchToSpace(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; // output shape can potentially be empty @@ -25,14 +25,18 @@ class BatchToSpace : public Node { void execute(dnnl::stream strm) override; bool created() const override; - bool needPrepareParams() const override { return false; }; - bool needShapeInfer() const override {return true;}; + bool needPrepareParams() const override { + return false; + }; + bool needShapeInfer() const override { + return true; + }; void executeDynamicImpl(dnnl::stream strm) override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; private: - template + template void batchToSpaceKernel(); private: @@ -42,6 +46,6 @@ class BatchToSpace : public Node { std::string errorPrefix; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/bin_conv.cpp b/src/plugins/intel_cpu/src/nodes/bin_conv.cpp index d1e82235ba9bb1..336a370374a9f9 100644 --- a/src/plugins/intel_cpu/src/nodes/bin_conv.cpp +++ b/src/plugins/intel_cpu/src/nodes/bin_conv.cpp @@ -3,34 +3,35 @@ // #include "bin_conv.h" -#include "eltwise.h" -#include "fake_quantize.h" -#include "conv.h" + #include #include #include -#include "dnnl_types.h" + +#include "conv.h" +#include "cpu/x64/cpu_isa_traits.hpp" +#include "cpu/x64/injectors/jit_uni_depthwise_injector.hpp" +#include "cpu/x64/injectors/jit_uni_eltwise_injector.hpp" +#include 
"cpu/x64/jit_generator.hpp" #include "dnnl_extension_utils.h" +#include "dnnl_types.h" +#include "eltwise.h" +#include "fake_quantize.h" #include "openvino/core/parallel.hpp" -#include "cpu/x64/jit_generator.hpp" -#include "cpu/x64/injectors/jit_uni_eltwise_injector.hpp" -#include "cpu/x64/injectors/jit_uni_depthwise_injector.hpp" -#include "cpu/x64/cpu_isa_traits.hpp" -#include "utils/general_utils.h" #include "openvino/opsets/opset1.hpp" +#include "utils/general_utils.h" #include "utils/ngraph_utils.hpp" // WA for xbyak.h #ifdef _WIN32 -# ifndef _WINSOCKAPI_ -# define _WINSOCKAPI_ -# endif -# ifndef _WINSOCK2API_ -# define _WINSOCK2API_ -# endif +# ifndef _WINSOCKAPI_ +# define _WINSOCKAPI_ +# endif +# ifndef _WINSOCK2API_ +# define _WINSOCK2API_ +# endif #endif - using namespace dnnl; using namespace dnnl::impl; using namespace dnnl::impl::cpu; @@ -42,14 +43,17 @@ namespace ov { namespace intel_cpu { namespace node { #if defined(OPENVINO_ARCH_X86_64) -#define GET_OFF(field) offsetof(jit_bin_conv_call_args, field) +# define GET_OFF(field) offsetof(jit_bin_conv_call_args, field) template struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_bin_conv_kernel_f32) - explicit jit_uni_bin_conv_kernel_f32(jit_bin_conv_params jcp, jit_dw_conv_params jcp_dw_conv, const dnnl_primitive_attr &attr) : - jit_uni_bin_conv_kernel(jcp, jcp_dw_conv, attr), jit_generator(jit_name()) {} + explicit jit_uni_bin_conv_kernel_f32(jit_bin_conv_params jcp, + jit_dw_conv_params jcp_dw_conv, + const dnnl_primitive_attr& attr) + : jit_uni_bin_conv_kernel(jcp, jcp_dw_conv, attr), + jit_generator(jit_name()) {} void create_ker() override { jit_generator::create_kernel(); @@ -57,16 +61,19 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ } void generate() override { - const auto &p = attr_.post_ops_; + const auto& p = attr_.post_ops_; int end_idx = jcp_.with_dw_conv ? 
p.find(primitive_kind::convolution) : p.len(); for (int i = 0; i < end_idx; i++) { - auto &post_op = p.entry_[i]; + auto& post_op = p.entry_[i]; if (post_op.is_eltwise()) { - eltwise_injectors.push_back(std::make_shared>( - this, post_op.eltwise, true, eltwise_reserved, mask_post_op_reserved)); + eltwise_injectors.push_back(std::make_shared>(this, + post_op.eltwise, + true, + eltwise_reserved, + mask_post_op_reserved)); } else if (post_op.is_depthwise()) { - depthwise_injectors.push_back(std::make_shared>( - this, post_op, mask_post_op_reserved)); + depthwise_injectors.push_back( + std::make_shared>(this, post_op, mask_post_op_reserved)); } } @@ -80,7 +87,7 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ mov(reg_oc_work, ptr[this->param1 + GET_OFF(oc_work)]); mov(reg_post_ops_data, ptr[this->param1 + GET_OFF(post_op_data)]); - mov(reg_oc_off, ptr[param1 + GET_OFF(oc_off)]); + mov(reg_oc_off, ptr[param1 + GET_OFF(oc_off)]); mov(reg_table, l_table); Label main_loop_label; @@ -98,14 +105,16 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ int nbits = 8; - L(main_loop_label); { + L(main_loop_label); + { cmp(reg_oc_work, jcp_.oc_block); jl(tail_label, T_NEAR); solve_common(1, jcp_.oc_block); sub(reg_oc_work, jcp_.oc_block); - add(reg_kernel_base, jcp_.oc_block * jcp_.nb_ic * jcp_.kh * jcp_.kw * div_up(jcp_.ic_block, nbits) * jcp_.typesize_in); + add(reg_kernel_base, + jcp_.oc_block * jcp_.nb_ic * jcp_.kh * jcp_.kw * div_up(jcp_.ic_block, nbits) * jcp_.typesize_in); if (jcp_.with_dw_conv) { add(reg_output_base, jcp_.oc_block * jcp_dw_conv_.kh * jcp_.ow * jcp_.typesize_out); @@ -137,8 +146,7 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ } private: - using Vmm = typename conditional3::type; + using Vmm = typename conditional3::type; using Ymm = const Xbyak::Ymm; using reg8_t = const Xbyak::Reg8; @@ -212,100 +220,108 @@ struct jit_uni_bin_conv_kernel_f32 : public 
jit_uni_bin_conv_kernel, public jit_ nstl::vector>> eltwise_injectors; nstl::vector>> depthwise_injectors; - void cvt2ps(dnnl::memory::data_type type_in, Vmm vmm_in, const Xbyak::Operand &op, bool scalar_load) { + void cvt2ps(dnnl::memory::data_type type_in, Vmm vmm_in, const Xbyak::Operand& op, bool scalar_load) { Xmm xmm_in = Xmm(vmm_in.getIdx()); switch (type_in) { - case memory::data_type::f32: - case memory::data_type::s32: - if (scalar_load) { - mov(reg_tmp_32, op); - uni_vmovq(xmm_in, reg_tmp_64); - } else { - uni_vmovups(vmm_in, op); - } - break; - case memory::data_type::s8: - if (scalar_load) { - movsx(reg_tmp_32, op); - uni_vmovq(xmm_in, reg_tmp_64); - } else { - uni_vpmovsxbd(vmm_in, op); - } - break; - case memory::data_type::u8: - if (scalar_load) { - movzx(reg_tmp_32, op); - uni_vmovq(xmm_in, reg_tmp_64); - } else { - uni_vpmovzxbd(vmm_in, op); - } - break; - default: assert(!"unsupported data type"); + case memory::data_type::f32: + case memory::data_type::s32: + if (scalar_load) { + mov(reg_tmp_32, op); + uni_vmovq(xmm_in, reg_tmp_64); + } else { + uni_vmovups(vmm_in, op); + } + break; + case memory::data_type::s8: + if (scalar_load) { + movsx(reg_tmp_32, op); + uni_vmovq(xmm_in, reg_tmp_64); + } else { + uni_vpmovsxbd(vmm_in, op); + } + break; + case memory::data_type::u8: + if (scalar_load) { + movzx(reg_tmp_32, op); + uni_vmovq(xmm_in, reg_tmp_64); + } else { + uni_vpmovzxbd(vmm_in, op); + } + break; + default: + assert(!"unsupported data type"); } if (type_in != data_type::f32) uni_vcvtdq2ps(vmm_in, vmm_in); } - void store_dst(const Xbyak::Address &op, Vmm vmm_dst, bool scalar_store) { + void store_dst(const Xbyak::Address& op, Vmm vmm_dst, bool scalar_store) { Ymm ymm_dst = Ymm(vmm_dst.getIdx()); Xmm xmm_dst = Xmm(vmm_dst.getIdx()); switch (jcp_.dst_dt) { - case memory::data_type::f32: - case memory::data_type::s32: - if (scalar_store) { - movq(reg_tmp_64, xmm_dst); - mov(op, reg_tmp_32); - } else { - uni_vmovups(op, vmm_dst); - } - break; - 
case memory::data_type::s8: - uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst); + case memory::data_type::f32: + case memory::data_type::s32: + if (scalar_store) { + movq(reg_tmp_64, xmm_dst); + mov(op, reg_tmp_32); + } else { + uni_vmovups(op, vmm_dst); + } + break; + case memory::data_type::s8: + uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst); - if (isa != x64::sse41 && !scalar_store) - vpermq(ymm_dst, ymm_dst, 0x08); + if (isa != x64::sse41 && !scalar_store) + vpermq(ymm_dst, ymm_dst, 0x08); - uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst); + uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst); - if (scalar_store) { - movq(reg_tmp_64, xmm_dst); - mov(op, reg_tmp_8); - } else { - if (isa != x64::sse41) - vmovq(op, xmm_dst); - else - movd(op, xmm_dst); - } - break; - case memory::data_type::u8: - case memory::data_type::bin: - uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst); + if (scalar_store) { + movq(reg_tmp_64, xmm_dst); + mov(op, reg_tmp_8); + } else { + if (isa != x64::sse41) + vmovq(op, xmm_dst); + else + movd(op, xmm_dst); + } + break; + case memory::data_type::u8: + case memory::data_type::bin: + uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst); - if (isa != x64::sse41 && !scalar_store) - vpermq(ymm_dst, ymm_dst, 0x08); + if (isa != x64::sse41 && !scalar_store) + vpermq(ymm_dst, ymm_dst, 0x08); - uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst); + uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst); - if (scalar_store) { - movq(reg_tmp_64, xmm_dst); - mov(op, reg_tmp_8); - } else { - if (isa != x64::sse41) - vmovq(op, xmm_dst); - else - movd(op, xmm_dst); - } + if (scalar_store) { + movq(reg_tmp_64, xmm_dst); + mov(op, reg_tmp_8); + } else { + if (isa != x64::sse41) + vmovq(op, xmm_dst); + else + movd(op, xmm_dst); + } - break; - default: - assert(!"unknown dst_dt"); + break; + default: + assert(!"unknown dst_dt"); } } - void apply_filter(int ur_w, int pad_l, int pad_r, int oc_blocks, int oc_step, int ic_blocks, bool last_icb, bool h_padded) { + void apply_filter(int ur_w, + int pad_l, + int pad_r, + int oc_blocks, + 
int oc_step, + int ic_blocks, + bool last_icb, + bool h_padded) { int kw = jcp_.kw; int kh = jcp_.kh; int stride_w = jcp_.stride_w; @@ -318,15 +334,16 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ for (int ki = 0; ki < kw; ki++) { int jj_start = nstl::max(0, div_up(pad_l - ki * dilate_w, stride_w)); - int jj_end = ur_w - nstl::max(0, div_up(ki*dilate_w+pad_r-(kw-1)*dilate_w, stride_w)); + int jj_end = ur_w - nstl::max(0, div_up(ki * dilate_w + pad_r - (kw - 1) * dilate_w, stride_w)); int _start = (!jcp_.exclude_pad) ? 0 : jj_start; int _end = (!jcp_.exclude_pad) ? ur_w : jj_end; for (int ifm2 = 0; ifm2 < ic_blocks; ifm2++) { for (int jj = _start; jj < _end; jj++) { - int inp_off = ((ki*dilate_w + jj*stride_w - pad_l)*div_up(jcp_.ic, nbits) + - ifm2 * div_up(ic_blk, nbits)) * jcp_.typesize_in; + int inp_off = ((ki * dilate_w + jj * stride_w - pad_l) * div_up(jcp_.ic, nbits) + + ifm2 * div_up(ic_blk, nbits)) * + jcp_.typesize_in; if (h_padded || jj < jj_start || jj >= jj_end) { uni_vmovups(vmm_src, ptr[reg_table + 8 * vlen]); @@ -336,10 +353,11 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ for (int r = 0; r < repeats; r++) { for (int ii = 0; ii < oc_blocks; ii++) { - int ker_off = (ifm2 * kh * kw * div_up(ic_blk, nbits) * oc_blk - + ii * jcp_.nb_ic * div_up(ic_blk, nbits) * kh * kw * oc_blk - + ki * div_up(ic_blk, nbits) * oc_blk - + r * div_up(ic_blk, nbits) * (oc_blk / 2)) * jcp_.typesize_in; + int ker_off = + (ifm2 * kh * kw * div_up(ic_blk, nbits) * oc_blk + + ii * jcp_.nb_ic * div_up(ic_blk, nbits) * kh * kw * oc_blk + + ki * div_up(ic_blk, nbits) * oc_blk + r * div_up(ic_blk, nbits) * (oc_blk / 2)) * + jcp_.typesize_in; uni_vmovups(vmm_tmp, ptr[aux1_reg_kernel + ker_off]); @@ -350,7 +368,8 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ if (mayiuse(x64::avx512_vpopcnt)) { vpopcntd(vmm_tmp, vmm_tmp); uni_vpaddd(Vmm(1 + r * jcp_.ur_w * 
jcp_.nb_oc_blocking + ur_w * ii + jj), - Vmm(1 + r * jcp_.ur_w * jcp_.nb_oc_blocking + ur_w * ii + jj), vmm_tmp); + Vmm(1 + r * jcp_.ur_w * jcp_.nb_oc_blocking + ur_w * ii + jj), + vmm_tmp); } else { if (isa == x64::sse41) { movups(vmm_tmp1, vmm_tmp); @@ -375,12 +394,15 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ } if (mayiuse(avx512_core_vnni)) { - vpdpbusd(Vmm(1 + r * jcp_.ur_w * jcp_.nb_oc_blocking + ur_w * ii + jj), vmm_tmp, vmm_one_u8); + vpdpbusd(Vmm(1 + r * jcp_.ur_w * jcp_.nb_oc_blocking + ur_w * ii + jj), + vmm_tmp, + vmm_one_u8); } else { uni_vpmaddubsw(vmm_tmp, vmm_tmp, vmm_one_u8); uni_vpmaddwd(vmm_tmp, vmm_tmp, vmm_one_s16); uni_vpaddd(Vmm(1 + r * jcp_.ur_w * jcp_.nb_oc_blocking + ur_w * ii + jj), - Vmm(1 + r * jcp_.ur_w * jcp_.nb_oc_blocking + ur_w * ii + jj), vmm_tmp); + Vmm(1 + r * jcp_.ur_w * jcp_.nb_oc_blocking + ur_w * ii + jj), + vmm_tmp); } } } @@ -431,22 +453,22 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ int nbits = 8; const int inp_mult = dilate_h * div_up(jcp_.ic, nbits); - Label t_overflow_label, no_t_overflow_label, - b_overflow_label, no_b_overflow_label; + Label t_overflow_label, no_t_overflow_label, b_overflow_label, no_b_overflow_label; mov(aux_reg_input, reg_input); mov(aux_reg_kernel, reg_kernel_base); - uni_vmovups(vmm_lookup, ptr[reg_table + 0 * vlen]); - uni_vmovups(vmm_mask, ptr[reg_table + 1 * vlen]); - uni_vmovups(vmm_one_u8, ptr[reg_table + 5 * vlen]); + uni_vmovups(vmm_lookup, ptr[reg_table + 0 * vlen]); + uni_vmovups(vmm_mask, ptr[reg_table + 1 * vlen]); + uni_vmovups(vmm_one_u8, ptr[reg_table + 5 * vlen]); uni_vmovups(vmm_one_s16, ptr[reg_table + 6 * vlen]); if (!jcp_.exclude_pad) { - mov(reg_overflow, ptr[param1 + GET_OFF(t_overflow)]); + mov(reg_overflow, ptr[param1 + GET_OFF(t_overflow)]); cmp(reg_overflow, 0); je(no_t_overflow_label, T_NEAR); - L(t_overflow_label); { + L(t_overflow_label); + { oh_step_unroll_kw(ur_w, pad_l, pad_r, 
oc_blocks, oc_step, true); add(aux_reg_kernel, jcp_.typesize_in * kw * jcp_.oc_block * div_up(jcp_.ic_block, nbits)); @@ -459,8 +481,8 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ Label skip_kh_loop; mov(reg_kh, ptr[this->param1 + GET_OFF(kh_padding)]); - if (!jcp_.exclude_pad || (jcp_.exclude_pad && - (jcp_.kh - 1) * (jcp_.dilate_h + 1) < nstl::max(jcp_.t_pad, jcp_.b_pad))) { + if (!jcp_.exclude_pad || + (jcp_.exclude_pad && (jcp_.kh - 1) * (jcp_.dilate_h + 1) < nstl::max(jcp_.t_pad, jcp_.b_pad))) { cmp(reg_kh, 0); je(skip_kh_loop, T_NEAR); } @@ -481,10 +503,11 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ L(skip_kh_loop); if (!jcp_.exclude_pad) { - mov(reg_overflow, ptr[param1 + GET_OFF(b_overflow)]); + mov(reg_overflow, ptr[param1 + GET_OFF(b_overflow)]); cmp(reg_overflow, 0); je(no_b_overflow_label, T_NEAR); - L(b_overflow_label); { + L(b_overflow_label); + { oh_step_unroll_kw(ur_w, pad_l, pad_r, oc_blocks, oc_step, true); add(aux_reg_kernel, jcp_.typesize_in * kw * jcp_.oc_block * div_up(jcp_.ic_block, nbits)); @@ -515,7 +538,7 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ kmovw(ktail_mask, reg_tmp_32); } - const auto &p = attr_.post_ops_; + const auto& p = attr_.post_ops_; for (int r = 0; r < repeats; r++) { int tail_size = isa == x64::sse41 ? nstl::min(jcp_.oc_block / 2, oc_step - r * jcp_.oc_block / 2) : oc_step; bool is_scalar_store = isa == x64::sse41 ? 
tail_size < jcp_.oc_block / 2 : tail_size < jcp_.oc_block; @@ -524,15 +547,17 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ if (jcp_.exclude_pad) { mov(reg_tmp_32, jcp_.ic); - imul(reg_tmp_32, ptr[param1 + GET_OFF(kh_padding)]); + imul(reg_tmp_32, ptr[param1 + GET_OFF(kh_padding)]); for (int jj = 0; jj < ur_w; jj++) kw_padding[jj] = 0; for (int ki = 0; ki < jcp_.kw; ki++) { int jj_start = nstl::max(0, div_up(pad_l - ki * (jcp_.dilate_w + 1), jcp_.stride_w)); - int jj_end = ur_w - nstl::max(0, div_up(ki * (jcp_.dilate_w + 1) + pad_r - - (jcp_.kw - 1) * (jcp_.dilate_w + 1), jcp_.stride_w)); + int jj_end = + ur_w - nstl::max(0, + div_up(ki * (jcp_.dilate_w + 1) + pad_r - (jcp_.kw - 1) * (jcp_.dilate_w + 1), + jcp_.stride_w)); for (int jj = jj_start; jj < jj_end; jj++) { kw_padding[jj]++; } @@ -552,8 +577,11 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ } for (int ii = 0; ii < oc_blocks; ii++) { - uni_vcvtdq2ps(Vmm(1 + r * jcp_.ur_w * jcp_.nb_oc_blocking + ur_w * ii + jj), Vmm(1 + r * jcp_.ur_w * jcp_.nb_oc_blocking + ur_w * ii + jj)); - uni_vfmadd213ps(Vmm(1 + r * jcp_.ur_w * jcp_.nb_oc_blocking + ur_w * ii + jj), vmm_scale, vmm_shift); + uni_vcvtdq2ps(Vmm(1 + r * jcp_.ur_w * jcp_.nb_oc_blocking + ur_w * ii + jj), + Vmm(1 + r * jcp_.ur_w * jcp_.nb_oc_blocking + ur_w * ii + jj)); + uni_vfmadd213ps(Vmm(1 + r * jcp_.ur_w * jcp_.nb_oc_blocking + ur_w * ii + jj), + vmm_scale, + vmm_shift); } } @@ -580,7 +608,9 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ for (int ii = 0; ii < oc_blocks; ii++) { depthwise_injectors[depthwise_inj_idx]->compute_vector_range(start_idx + ur_w * ii, - start_idx + ur_w * ii + ur_w, reg_d_weights, reg_d_weights); + start_idx + ur_w * ii + ur_w, + reg_d_weights, + reg_d_weights); add(reg_d_weights, jcp_.oc_block * sizeof(float)); } @@ -596,7 +626,7 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ if 
(is_scalar_store) { if (isa == x64::avx512_core) { - int o_off = jj * jcp_.oc * jcp_.ngroups; + int o_off = jj * jcp_.oc * jcp_.ngroups; Vmm vmm_in = vmm_sum | ktail_mask | T_z; @@ -604,7 +634,7 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ uni_vaddps(vmm_dst, vmm_dst, vmm_sum); } else { for (int oc = 0; oc < tail_size; oc++) { - int o_off = jj * jcp_.oc * jcp_.ngroups + r * (jcp_.oc_block / 2) + oc; + int o_off = jj * jcp_.oc * jcp_.ngroups + r * (jcp_.oc_block / 2) + oc; uni_vpxor(vmm_sum, vmm_sum, vmm_sum); cvt2ps(jcp_.dst_dt, vmm_sum, ptr[reg_output + o_off * jcp_.typesize_out], true); @@ -621,7 +651,8 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ } } } else { - size_t o_off = ii * jcp_.oc_block + jj * jcp_.oc * jcp_.ngroups + r * (jcp_.oc_block / 2); + size_t o_off = + ii * jcp_.oc_block + jj * jcp_.oc * jcp_.ngroups + r * (jcp_.oc_block / 2); cvt2ps(jcp_.dst_dt, vmm_sum, ptr[reg_output + o_off * jcp_.typesize_out], false); uni_vaddps(vmm_dst, vmm_dst, vmm_sum); @@ -649,10 +680,15 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ for (int ii = 0; ii < oc_blocks; ii++) { for (int jj = 0; jj < ur_w; jj++) { for (int r = 0; r < repeats; r++) { - int tail_size = isa == x64::sse41 ? nstl::min(jcp_.oc_block / 2, oc_step - r * jcp_.oc_block / 2) : oc_step; + int tail_size = + isa == x64::sse41 ? 
nstl::min(jcp_.oc_block / 2, oc_step - r * jcp_.oc_block / 2) : oc_step; mov(reg_b_mask, (1 << tail_size) - 1); - uni_vmovups(vmm_thr, ptr[reg_b_weights + (ii * jcp_.oc_block + r * (jcp_.oc_block / 2)) * sizeof(float)]); - uni_vmovups(vmm_out_mask, ptr[reg_b_out_mask + (ii * jcp_.oc_block + r * (jcp_.oc_block / 2)) * sizeof(float)]); + uni_vmovups( + vmm_thr, + ptr[reg_b_weights + (ii * jcp_.oc_block + r * (jcp_.oc_block / 2)) * sizeof(float)]); + uni_vmovups( + vmm_out_mask, + ptr[reg_b_out_mask + (ii * jcp_.oc_block + r * (jcp_.oc_block / 2)) * sizeof(float)]); Vmm vmm_dst = Vmm(1 + r * jcp_.ur_w * jcp_.nb_oc_blocking + ur_w * ii + jj); @@ -693,7 +729,8 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ } } else { for (int r = 0; r < repeats; r++) { - int tail_size = isa == x64::sse41 ? nstl::min(jcp_.oc_block / 2, oc_step - r * jcp_.oc_block / 2) : oc_step; + int tail_size = + isa == x64::sse41 ? nstl::min(jcp_.oc_block / 2, oc_step - r * jcp_.oc_block / 2) : oc_step; bool is_scalar_store = isa == x64::sse41 ? tail_size < jcp_.oc_block / 2 : tail_size < jcp_.oc_block; if (is_scalar_store) { for (int jj = 0; jj < ur_w; jj++) { @@ -735,7 +772,7 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ size_t o_off; if (jcp_.with_dw_conv) - o_off = ((size_t) ii * jcp_dw_conv_.kh * jcp_.ow + jj) * jcp_.oc_block + + o_off = ((size_t)ii * jcp_dw_conv_.kh * jcp_.ow + jj) * jcp_.oc_block + r * (jcp_.oc_block / 2); else o_off = ii * jcp_.oc_block + jj * jcp_.oc * jcp_.ngroups + r * (jcp_.oc_block / 2); @@ -759,14 +796,15 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ int nbits = 8; const int inp_mult = div_up(jcp_.ic, nbits); - const int out_mult = jcp_.with_dw_conv ? jcp_.oc_block : jcp_.with_binarization ? div_up(jcp_.oc, nbits) : jcp_.oc; + const int out_mult = jcp_.with_dw_conv ? jcp_.oc_block + : jcp_.with_binarization ? 
div_up(jcp_.oc, nbits) + : jcp_.oc; int l_pad = jcp_.l_pad; - int r_pad = nstl::max(0, (jcp_.ow - 1) * str_w + (kw - 1) * dilate_w - - (iw + l_pad - 1)); - int r_pad1 = (ur_w * n_oi - 1) * str_w + (kw - 1) * dilate_w - - (iw + l_pad - 1); - if (r_pad1 > 0) n_oi--; + int r_pad = nstl::max(0, (jcp_.ow - 1) * str_w + (kw - 1) * dilate_w - (iw + l_pad - 1)); + int r_pad1 = (ur_w * n_oi - 1) * str_w + (kw - 1) * dilate_w - (iw + l_pad - 1); + if (r_pad1 > 0) + n_oi--; mov(reg_input, reg_input_base); mov(reg_output, reg_output_base); @@ -779,9 +817,9 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ if (l_pad > 0) { n_oi--; if (n_oi < 0 && r_pad1 > 0) - width_blk_step(ur_w, l_pad, r_pad1, oc_blocks, oc_step); // "lrpad" + width_blk_step(ur_w, l_pad, r_pad1, oc_blocks, oc_step); // "lrpad" else - width_blk_step(ur_w, l_pad, 0, oc_blocks, oc_step); // "lpad" + width_blk_step(ur_w, l_pad, 0, oc_blocks, oc_step); // "lpad" add(reg_input, jcp_.typesize_in * (ur_w * str_w - l_pad) * inp_mult); add(reg_output, jcp_.typesize_out * ur_w * out_mult); } @@ -792,7 +830,7 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ if (n_oi > 0) { L(ow_loop_label); - width_blk_step(ur_w, 0, 0, oc_blocks, oc_step); // "middle" + width_blk_step(ur_w, 0, 0, oc_blocks, oc_step); // "middle" add(reg_input, jcp_.typesize_in * ur_w * str_w * inp_mult); add(reg_output, jcp_.typesize_out * ur_w * out_mult); @@ -801,14 +839,14 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ jl(ow_loop_label, T_NEAR); } - if (r_pad1 > 0 && n_oi >=0) { - width_blk_step(ur_w, 0, r_pad1, oc_blocks, oc_step); // "rpad" + if (r_pad1 > 0 && n_oi >= 0) { + width_blk_step(ur_w, 0, r_pad1, oc_blocks, oc_step); // "rpad" add(reg_input, jcp_.typesize_in * ur_w * str_w * inp_mult); add(reg_output, jcp_.typesize_out * ur_w * out_mult); } if (ur_w_tail != 0) - width_blk_step(ur_w_tail, 0, r_pad, oc_blocks, oc_step); // "tail" + 
width_blk_step(ur_w_tail, 0, r_pad, oc_blocks, oc_step); // "tail" pop(reg_oc_off); pop(reg_oc_work); @@ -817,17 +855,15 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ } void prepare_table() { - const unsigned int cvals[] = { - 0x02010100, // 0 1 1 2 - 0x03020201, // 1 2 2 3 - 0x03020201, // 1 2 2 3 - 0x04030302, // 2 3 3 4 - 0x0f0f0f0f, - 0x000000ff, - 0xc0000000, // -2.0f - 0x01010101, - 0x00010001 - }; + const unsigned int cvals[] = {0x02010100, // 0 1 1 2 + 0x03020201, // 1 2 2 3 + 0x03020201, // 1 2 2 3 + 0x04030302, // 2 3 3 4 + 0x0f0f0f0f, + 0x000000ff, + 0xc0000000, // -2.0f + 0x01010101, + 0x00010001}; size_t simd_w = vlen / sizeof(int32_t); @@ -876,7 +912,8 @@ struct jit_uni_bin_conv_kernel_f32 : public jit_uni_bin_conv_kernel, public jit_ } }; #endif -bool BinaryConvolution::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool BinaryConvolution::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage) noexcept { try { if (isDynamicNgraphNode(op)) { errorMessage = "Doesn't support op with dynamic shapes"; @@ -934,7 +971,7 @@ void BinaryConvolution::getSupportedDescriptors() { withSum = false; size_t expectedInputEdgesNum = 2; for (size_t i = 0; i < fusedWith.size(); i++) { - auto *eltwiseNode = dynamic_cast(fusedWith[i].get()); + auto* eltwiseNode = dynamic_cast(fusedWith[i].get()); if (eltwiseNode && eltwiseNode->isSpecialConvolutionAddFusing()) { withSum = true; expectedInputEdgesNum++; @@ -979,22 +1016,30 @@ void BinaryConvolution::initSupportedPrimitiveDescriptors() { if (implType != impl_desc_type::ref) { // optimzed implementation -// auto weiFormat = implType == impl_desc_type::jit_avx512 ? memory::format_tag::OhIw16o32i : memory::format_tag::OhIw8o32i; + // auto weiFormat = implType == impl_desc_type::jit_avx512 ? 
memory::format_tag::OhIw16o32i : + // memory::format_tag::OhIw8o32i; - //activation + // activation auto nspcCreator = BlockedDescCreator::getCommonCreators().at(LayoutType::nspc); config.inConfs[0].setMemDesc(nspcCreator->createSharedDesc(ov::element::u1, getInputShapeAtPort(0))); - //weights - size_t weiFirstDimBlockSize = implType == impl_desc_type::jit_avx512 ? 16 : 8; //memory::format_tag::OIhw16o32i : memory::format_tag::OIhw8o32i; + // weights + size_t weiFirstDimBlockSize = implType == impl_desc_type::jit_avx512 + ? 16 + : 8; // memory::format_tag::OIhw16o32i : memory::format_tag::OIhw8o32i; auto weiDims = getInputShapeAtPort(1).getStaticDims(); - std::vector weiBlockDims = {div_up(weiDims[0], weiFirstDimBlockSize), div_up(weiDims[1], 32), - weiDims[2], weiDims[3], weiFirstDimBlockSize, 32}; + std::vector weiBlockDims = {div_up(weiDims[0], weiFirstDimBlockSize), + div_up(weiDims[1], 32), + weiDims[2], + weiDims[3], + weiFirstDimBlockSize, + 32}; std::vector weiOrder = {0, 1, 2, 3, 0, 1}; - config.inConfs[1].setMemDesc(std::make_shared(ov::element::u1, Shape(weiDims), weiBlockDims, weiOrder)); + config.inConfs[1].setMemDesc( + std::make_shared(ov::element::u1, Shape(weiDims), weiBlockDims, weiOrder)); - //result + // result auto outputPrecision = withBinarization ? ov::element::u1 : ov::element::f32; config.outConfs[0].setMemDesc(nspcCreator->createSharedDesc(outputPrecision, getOutputShapeAtPort(0))); if (withSum) { @@ -1056,14 +1101,15 @@ void BinaryConvolution::createPrimitive() { jcp.with_dw_conv = false; jcp.with_binarization = withBinarization; - const auto &p = (*attr.get()).post_ops_; + const auto& p = (*attr.get()).post_ops_; jcp.with_sum = p.find(primitive_kind::sum) != -1; jcp.with_binarization = p.find(primitive_kind::binarization) != -1; int simd_w = implType == impl_desc_type::jit_avx512 ? 16 : 8; jcp.ur_w = implType == impl_desc_type::jit_avx512 ? 
4 : 2; - if (jcp.ow < jcp.ur_w) jcp.ur_w = jcp.ow; + if (jcp.ow < jcp.ur_w) + jcp.ur_w = jcp.ow; jcp.ur_w_tail = jcp.ow % jcp.ur_w; jcp.ic_block = 32; @@ -1073,7 +1119,10 @@ void BinaryConvolution::createPrimitive() { jcp.oc_block = simd_w; jcp.nb_oc = div_up(jcp.oc, jcp.oc_block); - jcp.nb_oc_blocking = nstl::min(implType == impl_desc_type::jit_sse42 ? 2 : implType == impl_desc_type::jit_avx2 ? 4 : 6, jcp.nb_oc); + jcp.nb_oc_blocking = nstl::min(implType == impl_desc_type::jit_sse42 ? 2 + : implType == impl_desc_type::jit_avx2 ? 4 + : 6, + jcp.nb_oc); auto srcPrecision = getParentEdgeAt(0)->getMemory().getDesc().getPrecision(); auto dstPrecision = getChildEdgeAt(0)->getMemory().getDesc().getPrecision(); @@ -1082,11 +1131,13 @@ void BinaryConvolution::createPrimitive() { jcp.typesize_in = srcPrecision == ov::element::u1 ? 1 : srcPrecision.size(); jcp.typesize_out = dstPrecision == ov::element::u1 ? 1 : dstPrecision.size(); - int r_pad_no_tail = nstl::max(0, (jcp.ow - jcp.ur_w_tail - 1) * jcp.stride_w - + (jcp.kw - 1) * (jcp.dilate_w + 1) - (jcp.iw + jcp.l_pad - 1)); + int r_pad_no_tail = nstl::max( + 0, + (jcp.ow - jcp.ur_w_tail - 1) * jcp.stride_w + (jcp.kw - 1) * (jcp.dilate_w + 1) - (jcp.iw + jcp.l_pad - 1)); - bool args_ok = (jcp.l_pad <= jcp.ur_w) && (r_pad_no_tail <= jcp.ur_w) && - IMPLICATION(jcp.kw > 7, (jcp.t_pad == 0 && jcp.l_pad == 0) || (jcp.stride_w == 1 && jcp.stride_h == 1)); + bool args_ok = + (jcp.l_pad <= jcp.ur_w) && (r_pad_no_tail <= jcp.ur_w) && + IMPLICATION(jcp.kw > 7, (jcp.t_pad == 0 && jcp.l_pad == 0) || (jcp.stride_w == 1 && jcp.stride_h == 1)); if (!args_ok) OPENVINO_THROW("BinaryConvolution with name '", getName(), "' has unsupported parameters"); #if defined(OPENVINO_ARCH_X86_64) @@ -1122,12 +1173,12 @@ bool BinaryConvolution::canFuse(const NodePtr& node) const { } } -void BinaryConvolution::setPostOps(dnnl::primitive_attr &attr) { +void BinaryConvolution::setPostOps(dnnl::primitive_attr& attr) { dnnl::post_ops ops; 
postOpsDataPtrs.clear(); - for (auto &node : fusedWith) { - auto* eltwiseNode = dynamic_cast(node.get()); + for (auto& node : fusedWith) { + auto* eltwiseNode = dynamic_cast(node.get()); if (eltwiseNode) { if (eltwiseNode->isSpecialConvolutionAddFusing()) { ops.append_sum(1.0); @@ -1138,7 +1189,7 @@ void BinaryConvolution::setPostOps(dnnl::primitive_attr &attr) { continue; } - auto* fakeQuantizeNode = dynamic_cast(node.get()); + auto* fakeQuantizeNode = dynamic_cast(node.get()); if (fakeQuantizeNode) { fakeQuantizeNode->appendPostOps(ops, getOutputShapeAtPort(0).getStaticDims(), postOpsDataPtrs); continue; @@ -1154,9 +1205,13 @@ void BinaryConvolution::setPostOps(dnnl::primitive_attr &attr) { attr.set_post_ops(ops); } -void BinaryConvolution::executeOptimized(const uint8_t* src, const uint8_t* weights, uint8_t* dst, - const std::vector& s_str, const std::vector& w_str, const std::vector& d_str) { - auto dst_f32 = reinterpret_cast(dst); +void BinaryConvolution::executeOptimized(const uint8_t* src, + const uint8_t* weights, + uint8_t* dst, + const std::vector& s_str, + const std::vector& w_str, + const std::vector& d_str) { + auto dst_f32 = reinterpret_cast(dst); const int MB = jcp.mb; @@ -1170,26 +1225,28 @@ void BinaryConvolution::executeOptimized(const uint8_t* src, const uint8_t* weig auto par_conv = jit_bin_conv_call_args(); const int ij = oh * jcp.stride_h; - const int i_t_overflow = nstl::min(jcp.kh, div_up(nstl::max(0, jcp.t_pad - ij), (jcp.dilate_h+1))); - const int i_b_overflow = nstl::min(jcp.kh, div_up(nstl::max(jcp.ih, ij + (jcp.kh-1) * (jcp.dilate_h+1) - - jcp.t_pad+1) - jcp.ih, (jcp.dilate_h + 1))); + const int i_t_overflow = nstl::min(jcp.kh, div_up(nstl::max(0, jcp.t_pad - ij), (jcp.dilate_h + 1))); + const int i_b_overflow = + nstl::min(jcp.kh, + div_up(nstl::max(jcp.ih, ij + (jcp.kh - 1) * (jcp.dilate_h + 1) - jcp.t_pad + 1) - jcp.ih, + (jcp.dilate_h + 1))); const size_t _oc = g * jcp.nb_oc + ocb; const size_t _ic = g * jcp.nb_ic; const int ih = 
nstl::max(ij - jcp.t_pad + i_t_overflow * (jcp.dilate_h + 1), 0); - par_conv.src = &src[(n * s_str[0] + _ic*jcp.ic_block * s_str[1] + ih * s_str[2]) / nbits]; + par_conv.src = &src[(n * s_str[0] + _ic * jcp.ic_block * s_str[1] + ih * s_str[2]) / nbits]; if (jcp.with_binarization) { - par_conv.dst = &dst[(n * d_str[0] + _oc*jcp.oc_block * d_str[1] + oh * d_str[2]) / nbits]; + par_conv.dst = &dst[(n * d_str[0] + _oc * jcp.oc_block * d_str[1] + oh * d_str[2]) / nbits]; } else { - par_conv.dst = &dst_f32[n * d_str[0] + _oc*jcp.oc_block * d_str[1] + oh * d_str[2]]; + par_conv.dst = &dst_f32[n * d_str[0] + _oc * jcp.oc_block * d_str[1] + oh * d_str[2]]; } const int wh = jcp.exclude_pad ? i_t_overflow : 0; par_conv.filt = &weights[(ocb * w_str[0] + wh * w_str[2]) / nbits]; - par_conv.oc_work = nstl::min((ocb + ocb_num) * jcp.oc_block, jcp.oc) - ocb*jcp.oc_block; + par_conv.oc_work = nstl::min((ocb + ocb_num) * jcp.oc_block, jcp.oc) - ocb * jcp.oc_block; par_conv.kw_padding = 0; const int kh_padding = jcp.kh - i_t_overflow - i_b_overflow; @@ -1204,9 +1261,13 @@ void BinaryConvolution::executeOptimized(const uint8_t* src, const uint8_t* weig }); } -void BinaryConvolution::executeReference(const uint8_t* src, const uint8_t* weights, uint8_t* dst, - const std::vector& s_str, const std::vector& w_str, const std::vector& d_str) { - auto dst_fp = reinterpret_cast(dst); +void BinaryConvolution::executeReference(const uint8_t* src, + const uint8_t* weights, + uint8_t* dst, + const std::vector& s_str, + const std::vector& w_str, + const std::vector& d_str) { + auto dst_fp = reinterpret_cast(dst); const bool with_groups = jcp.ngroups > 1; @@ -1240,7 +1301,7 @@ void BinaryConvolution::executeReference(const uint8_t* src, const uint8_t* weig return (uint8_t)((val >> bit) & 0x0001); }; - auto ker = [=](int32_t &d, int g, int mb, int oc, int oh, int ow) { + auto ker = [=](int32_t& d, int g, int mb, int oc, int oh, int ow) { for (int ic = 0; ic < IC; ++ic) { for (int kh = 0; kh < KH; 
++kh) { for (int kw = 0; kw < KW; ++kw) { @@ -1259,14 +1320,14 @@ void BinaryConvolution::executeReference(const uint8_t* src, const uint8_t* weig if (pad_value == 0) continue; else - s = pad_value == 1.0f ? (uint8_t) 1 : (uint8_t) 0; + s = pad_value == 1.0f ? (uint8_t)1 : (uint8_t)0; } else { - s = extract_bit(src[iidx / nbits], (uint8_t) (iidx % nbits)); + s = extract_bit(src[iidx / nbits], (uint8_t)(iidx % nbits)); } - uint8_t w = extract_bit(weights[widx / nbits], (uint8_t) (widx % nbits)); + uint8_t w = extract_bit(weights[widx / nbits], (uint8_t)(widx % nbits)); - d += (int32_t) (s ^ w); + d += (int32_t)(s ^ w); } } } @@ -1280,13 +1341,11 @@ void BinaryConvolution::executeReference(const uint8_t* src, const uint8_t* weig if (pad_value == 0.0f) { const int i_left_overflow = nstl::max(0, (padL - ow * KSW)); const int i_right_overflow = nstl::max(IW, (ow * KSW + (KW - 1) * (KDW + 1) - padL + 1)) - IW; - const int kw_padding = - KW - div_up(i_left_overflow, (KDW + 1)) - div_up(i_right_overflow, (KDW + 1)); + const int kw_padding = KW - div_up(i_left_overflow, (KDW + 1)) - div_up(i_right_overflow, (KDW + 1)); const int i_top_overflow = nstl::max(0, (padT - oh * KSH)); const int i_bottom_overflow = nstl::max(IH, (oh * KSH + (KH - 1) * (KDH + 1) - padT + 1)) - IH; - const int kh_padding = - KH - div_up(i_top_overflow, (KDH + 1)) - div_up(i_bottom_overflow, (KDH + 1)); + const int kh_padding = KH - div_up(i_top_overflow, (KDH + 1)) - div_up(i_bottom_overflow, (KDH + 1)); base_value = IC * kh_padding * kw_padding; } else { @@ -1295,7 +1354,7 @@ void BinaryConvolution::executeReference(const uint8_t* src, const uint8_t* weig float a_fp = base_value - static_cast(2 * a); - dst_fp[mb * d_str[0] + (g*OC + oc) * d_str[1] + oh * d_str[2] + ow * d_str[3]] = a_fp; + dst_fp[mb * d_str[0] + (g * OC + oc) * d_str[1] + oh * d_str[2] + ow * d_str[3]] = a_fp; }); } @@ -1342,6 +1401,6 @@ bool BinaryConvolution::created() const { return getType() == Type::BinaryConvolution; } -} // 
namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/bin_conv.h b/src/plugins/intel_cpu/src/nodes/bin_conv.h index 86b5cb41b2bf6d..661e075b680ec7 100644 --- a/src/plugins/intel_cpu/src/nodes/bin_conv.h +++ b/src/plugins/intel_cpu/src/nodes/bin_conv.h @@ -39,9 +39,9 @@ struct jit_dw_conv_params { }; struct jit_bin_conv_call_args { - const void *src; - const void *dst; - const void *filt; + const void* src; + const void* dst; + const void* filt; size_t kh_padding; size_t kw_padding; size_t oc_work; @@ -52,15 +52,20 @@ struct jit_bin_conv_call_args { }; struct jit_uni_bin_conv_kernel { - void (*ker_)(const jit_bin_conv_call_args *); + void (*ker_)(const jit_bin_conv_call_args*); - void operator()(const jit_bin_conv_call_args *args) { + void operator()(const jit_bin_conv_call_args* args) { assert(ker_); ker_(args); } - explicit jit_uni_bin_conv_kernel(jit_bin_conv_params jcp, jit_dw_conv_params jcp_dw_conv, const dnnl_primitive_attr &attr) : - ker_(nullptr), jcp_(jcp), jcp_dw_conv_(jcp_dw_conv), attr_(attr) {} + explicit jit_uni_bin_conv_kernel(jit_bin_conv_params jcp, + jit_dw_conv_params jcp_dw_conv, + const dnnl_primitive_attr& attr) + : ker_(nullptr), + jcp_(jcp), + jcp_dw_conv_(jcp_dw_conv), + attr_(attr) {} virtual ~jit_uni_bin_conv_kernel() {} virtual void create_ker() = 0; @@ -68,7 +73,7 @@ struct jit_uni_bin_conv_kernel { jit_bin_conv_params jcp_; jit_dw_conv_params jcp_dw_conv_; - const dnnl_primitive_attr &attr_; + const dnnl_primitive_attr& attr_; }; class BinaryConvolution : public Node { @@ -83,12 +88,14 @@ class BinaryConvolution : public Node { bool canBeInPlace() const override { return false; } - void setPostOps(dnnl::primitive_attr &attr); + void setPostOps(dnnl::primitive_attr& attr); bool canFuse(const NodePtr& node) const override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; - 
impl_desc_type getImplType() { return implType; } + impl_desc_type getImplType() { + return implType; + } private: bool withSum = false; @@ -110,14 +117,22 @@ class BinaryConvolution : public Node { impl_desc_type implType = impl_desc_type::ref; - void executeOptimized(const uint8_t* src, const uint8_t* weights, uint8_t* dst, - const std::vector& s_str, const std::vector& w_str, const std::vector& d_str); - void executeReference(const uint8_t* src, const uint8_t* weights, uint8_t* dst, - const std::vector& s_str, const std::vector& w_str, const std::vector& d_str); + void executeOptimized(const uint8_t* src, + const uint8_t* weights, + uint8_t* dst, + const std::vector& s_str, + const std::vector& w_str, + const std::vector& d_str); + void executeReference(const uint8_t* src, + const uint8_t* weights, + uint8_t* dst, + const std::vector& s_str, + const std::vector& w_str, + const std::vector& d_str); std::string errorPrefix; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/broadcast.cpp b/src/plugins/intel_cpu/src/nodes/broadcast.cpp index c88803e07de601..646e186922b397 100644 --- a/src/plugins/intel_cpu/src/nodes/broadcast.cpp +++ b/src/plugins/intel_cpu/src/nodes/broadcast.cpp @@ -2,15 +2,18 @@ // SPDX-License-Identifier: Apache-2.0 // -#include +#include "broadcast.h" + +#include + #include +#include + +#include "common/cpu_memcpy.h" #include "dnnl_types.h" -#include "openvino/core/parallel.hpp" -#include -#include "broadcast.h" #include "nodes/common/blocked_desc_creator.h" +#include "openvino/core/parallel.hpp" #include "openvino/opsets/opset1.hpp" -#include "common/cpu_memcpy.h" #include "utils/ngraph_utils.hpp" namespace ov { @@ -24,19 +27,20 @@ bool Broadcast::isSupportedOperation(const std::shared_ptr& op, return false; } if (!one_of(ov::as_type_ptr(op)->get_broadcast_spec().m_type, - ov::op::AutoBroadcastType::NUMPY, 
ov::op::AutoBroadcastType::EXPLICIT)) { + ov::op::AutoBroadcastType::NUMPY, + ov::op::AutoBroadcastType::EXPLICIT)) { errorMessage = "Only NUMPY and EXPLICIT broadcast types are supported."; return false; } if (op->get_input_partial_shape(TARGET_SHAPE_IDX).is_dynamic() || - (op->get_input_size() > AXES_MAPPING_IDX && op->get_input_partial_shape(AXES_MAPPING_IDX).is_dynamic())) { + (op->get_input_size() > AXES_MAPPING_IDX && op->get_input_partial_shape(AXES_MAPPING_IDX).is_dynamic())) { errorMessage = "Only static shapes are supported for target shape and axes mapping inputs."; return false; } if (!isDynamicNgraphNode(op) && - (!ov::is_type(op->get_input_node_ptr(TARGET_SHAPE_IDX)) || - (op->get_input_size() > AXES_MAPPING_IDX && - !ov::is_type(op->get_input_node_ptr(AXES_MAPPING_IDX))))) { + (!ov::is_type(op->get_input_node_ptr(TARGET_SHAPE_IDX)) || + (op->get_input_size() > AXES_MAPPING_IDX && + !ov::is_type(op->get_input_node_ptr(AXES_MAPPING_IDX))))) { errorMessage = "Only constant target shapes and axis mapping inputs are supported for static shapes."; return false; } @@ -72,12 +76,13 @@ Broadcast::Broadcast(const std::shared_ptr& op, const GraphContext::CP if (ov::is_type(op->get_input_node_ptr(TARGET_SHAPE_IDX))) { constMap[TARGET_SHAPE_IDX] = true; - targetShape = (ov::as_type(op->get_input_node_ptr(TARGET_SHAPE_IDX)))->get_vector(); + targetShape = + (ov::as_type(op->get_input_node_ptr(TARGET_SHAPE_IDX)))->get_vector(); } - if (broadcastType == EXPLICIT && - ov::is_type(op->get_input_node_ptr(AXES_MAPPING_IDX))) { + if (broadcastType == EXPLICIT && ov::is_type(op->get_input_node_ptr(AXES_MAPPING_IDX))) { constMap[AXES_MAPPING_IDX] = true; - axesMapping = ov::as_type(op->get_input_node_ptr(AXES_MAPPING_IDX))->get_vector(); + axesMapping = + ov::as_type(op->get_input_node_ptr(AXES_MAPPING_IDX))->get_vector(); } } @@ -126,7 +131,8 @@ void Broadcast::prepareParams() { repeats.assign(targetShape.begin(), targetShape.end()); const auto ndims = repeats.size(); - 
auto srcBlockedDims = getParentEdgeAt(INPUT_DATA_IDX)->getMemory().getDescWithType()->getBlockDims(); + auto srcBlockedDims = + getParentEdgeAt(INPUT_DATA_IDX)->getMemory().getDescWithType()->getBlockDims(); auto dstBlockedDims = getChildEdgeAt(0)->getMemory().getDescWithType()->getBlockDims(); if (broadcastType == NUMPY) { @@ -227,8 +233,8 @@ void Broadcast::plainExecute(dnnl::stream strm) { } const size_t workAmountDst = dstStrides[0] * dstDims[0]; - const auto *srcData = getSrcDataAtPortAs(INPUT_DATA_IDX); - auto *dstData = getDstDataAtPortAs(0); + const auto* srcData = getSrcDataAtPortAs(INPUT_DATA_IDX); + auto* dstData = getDstDataAtPortAs(0); parallel_nt(0, [&](const int ithr, const int nthr) { size_t i = 0lu, srcIdx = 0lu, start = 0lu, end = 0lu; @@ -246,7 +252,8 @@ void Broadcast::plainExecute(dnnl::stream strm) { for (int j = dataDstRank - 1; j >= 0; j--) { counters[j] = (counters[j] + 1) % dstDims[j]; - if (counters[j] != 0) break; + if (counters[j] != 0) + break; } } }); @@ -256,6 +263,6 @@ bool Broadcast::created() const { return getType() == Type::Broadcast; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/broadcast.h b/src/plugins/intel_cpu/src/nodes/broadcast.h index 1435314ee08776..df9ad4614e311d 100644 --- a/src/plugins/intel_cpu/src/nodes/broadcast.h +++ b/src/plugins/intel_cpu/src/nodes/broadcast.h @@ -4,12 +4,12 @@ #pragma once -#include "common/tile_broadcast_utils.h" - #include #include #include +#include "common/tile_broadcast_utils.h" + namespace ov { namespace intel_cpu { namespace node { @@ -35,10 +35,7 @@ class Broadcast : public Node, public TileBroadcastCommon { private: void plainExecute(dnnl::stream strm); - enum AutoBroadcastType { - NUMPY, - EXPLICIT - }; + enum AutoBroadcastType { NUMPY, EXPLICIT }; AutoBroadcastType broadcastType = NUMPY; static constexpr size_t INPUT_DATA_IDX = 0; @@ -51,6 +48,6 @@ 
class Broadcast : public Node, public TileBroadcastCommon { std::string errorPrefix; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/bucketize.cpp b/src/plugins/intel_cpu/src/nodes/bucketize.cpp index a71255c0d531e4..cfa4bb031501ef 100644 --- a/src/plugins/intel_cpu/src/nodes/bucketize.cpp +++ b/src/plugins/intel_cpu/src/nodes/bucketize.cpp @@ -2,14 +2,15 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "bucketize.h" + +#include +#include #include #include -#include -#include "openvino/opsets/opset3.hpp" -#include #include "openvino/core/parallel.hpp" -#include "bucketize.h" +#include "openvino/opsets/opset3.hpp" namespace ov { namespace intel_cpu { @@ -70,16 +71,15 @@ void Bucketize::initSupportedPrimitiveDescriptors() { output_precision = ov::element::i32; } - addSupportedPrimDesc({{LayoutType::ncsp, input_precision}, - {LayoutType::ncsp, boundaries_precision}}, + addSupportedPrimDesc({{LayoutType::ncsp, input_precision}, {LayoutType::ncsp, boundaries_precision}}, {{LayoutType::ncsp, output_precision}}, impl_desc_type::ref_any); } inline constexpr uint32_t getElementsMask(ov::element::Type precision1, - ov::element::Type precision2, - ov::element::Type precision3 = ov::element::undefined, - ov::element::Type precision4 = ov::element::undefined) { + ov::element::Type precision2, + ov::element::Type precision3 = ov::element::undefined, + ov::element::Type precision4 = ov::element::undefined) { return static_cast(ov::element::Type_t(precision1)) | (static_cast(ov::element::Type_t(precision2)) << 8) | (static_cast(ov::element::Type_t(precision3)) << 16) | @@ -90,98 +90,98 @@ void Bucketize::execute(dnnl::stream strm) { auto precision_mask = getElementsMask(input_precision, boundaries_precision, output_precision); switch (precision_mask) { - case getElementsMask(ov::element::f32, ov::element::f32, ov::element::i32): - 
bucketize::value_type, - element_type_traits::value_type, - element_type_traits::value_type>(); - break; - case getElementsMask(ov::element::f32, ov::element::f32, ov::element::i64): - bucketize::value_type, - element_type_traits::value_type, - element_type_traits::value_type>(); - break; - case getElementsMask(ov::element::f32, ov::element::i32, ov::element::i32): - bucketize::value_type, - element_type_traits::value_type, - element_type_traits::value_type>(); - break; - case getElementsMask(ov::element::f32, ov::element::i32, ov::element::i64): - bucketize::value_type, - element_type_traits::value_type, - element_type_traits::value_type>(); - break; - case getElementsMask(ov::element::f32, ov::element::i64, ov::element::i32): - bucketize::value_type, - element_type_traits::value_type, - element_type_traits::value_type>(); - break; - case getElementsMask(ov::element::f32, ov::element::i64, ov::element::i64): - bucketize::value_type, - element_type_traits::value_type, - element_type_traits::value_type>(); - break; - case getElementsMask(ov::element::i32, ov::element::f32, ov::element::i32): - bucketize::value_type, - element_type_traits::value_type, - element_type_traits::value_type>(); - break; - case getElementsMask(ov::element::i32, ov::element::f32, ov::element::i64): - bucketize::value_type, - element_type_traits::value_type, - element_type_traits::value_type>(); - break; - case getElementsMask(ov::element::i32, ov::element::i32, ov::element::i32): - bucketize::value_type, - element_type_traits::value_type, - element_type_traits::value_type>(); - break; - case getElementsMask(ov::element::i32, ov::element::i32, ov::element::i64): - bucketize::value_type, - element_type_traits::value_type, - element_type_traits::value_type>(); - break; - case getElementsMask(ov::element::i32, ov::element::i64, ov::element::i32): - bucketize::value_type, - element_type_traits::value_type, - element_type_traits::value_type>(); - break; - case getElementsMask(ov::element::i32, 
ov::element::i64, ov::element::i64): - bucketize::value_type, - element_type_traits::value_type, - element_type_traits::value_type>(); - break; - case getElementsMask(ov::element::i64, ov::element::f32, ov::element::i32): - bucketize::value_type, - element_type_traits::value_type, - element_type_traits::value_type>(); - break; - case getElementsMask(ov::element::i64, ov::element::f32, ov::element::i64): - bucketize::value_type, - element_type_traits::value_type, - element_type_traits::value_type>(); - break; - case getElementsMask(ov::element::i64, ov::element::i32, ov::element::i32): - bucketize::value_type, - element_type_traits::value_type, - element_type_traits::value_type>(); - break; - case getElementsMask(ov::element::i64, ov::element::i32, ov::element::i64): - bucketize::value_type, - element_type_traits::value_type, - element_type_traits::value_type>(); - break; - case getElementsMask(ov::element::i64, ov::element::i64, ov::element::i32): - bucketize::value_type, - element_type_traits::value_type, - element_type_traits::value_type>(); - break; - case getElementsMask(ov::element::i64, ov::element::i64, ov::element::i64): - bucketize::value_type, - element_type_traits::value_type, - element_type_traits::value_type>(); - break; - default: - OPENVINO_THROW(errorPrefix, " has unsupported precision: ", precision_mask); + case getElementsMask(ov::element::f32, ov::element::f32, ov::element::i32): + bucketize::value_type, + element_type_traits::value_type, + element_type_traits::value_type>(); + break; + case getElementsMask(ov::element::f32, ov::element::f32, ov::element::i64): + bucketize::value_type, + element_type_traits::value_type, + element_type_traits::value_type>(); + break; + case getElementsMask(ov::element::f32, ov::element::i32, ov::element::i32): + bucketize::value_type, + element_type_traits::value_type, + element_type_traits::value_type>(); + break; + case getElementsMask(ov::element::f32, ov::element::i32, ov::element::i64): + 
bucketize::value_type, + element_type_traits::value_type, + element_type_traits::value_type>(); + break; + case getElementsMask(ov::element::f32, ov::element::i64, ov::element::i32): + bucketize::value_type, + element_type_traits::value_type, + element_type_traits::value_type>(); + break; + case getElementsMask(ov::element::f32, ov::element::i64, ov::element::i64): + bucketize::value_type, + element_type_traits::value_type, + element_type_traits::value_type>(); + break; + case getElementsMask(ov::element::i32, ov::element::f32, ov::element::i32): + bucketize::value_type, + element_type_traits::value_type, + element_type_traits::value_type>(); + break; + case getElementsMask(ov::element::i32, ov::element::f32, ov::element::i64): + bucketize::value_type, + element_type_traits::value_type, + element_type_traits::value_type>(); + break; + case getElementsMask(ov::element::i32, ov::element::i32, ov::element::i32): + bucketize::value_type, + element_type_traits::value_type, + element_type_traits::value_type>(); + break; + case getElementsMask(ov::element::i32, ov::element::i32, ov::element::i64): + bucketize::value_type, + element_type_traits::value_type, + element_type_traits::value_type>(); + break; + case getElementsMask(ov::element::i32, ov::element::i64, ov::element::i32): + bucketize::value_type, + element_type_traits::value_type, + element_type_traits::value_type>(); + break; + case getElementsMask(ov::element::i32, ov::element::i64, ov::element::i64): + bucketize::value_type, + element_type_traits::value_type, + element_type_traits::value_type>(); + break; + case getElementsMask(ov::element::i64, ov::element::f32, ov::element::i32): + bucketize::value_type, + element_type_traits::value_type, + element_type_traits::value_type>(); + break; + case getElementsMask(ov::element::i64, ov::element::f32, ov::element::i64): + bucketize::value_type, + element_type_traits::value_type, + element_type_traits::value_type>(); + break; + case getElementsMask(ov::element::i64, 
ov::element::i32, ov::element::i32): + bucketize::value_type, + element_type_traits::value_type, + element_type_traits::value_type>(); + break; + case getElementsMask(ov::element::i64, ov::element::i32, ov::element::i64): + bucketize::value_type, + element_type_traits::value_type, + element_type_traits::value_type>(); + break; + case getElementsMask(ov::element::i64, ov::element::i64, ov::element::i32): + bucketize::value_type, + element_type_traits::value_type, + element_type_traits::value_type>(); + break; + case getElementsMask(ov::element::i64, ov::element::i64, ov::element::i64): + bucketize::value_type, + element_type_traits::value_type, + element_type_traits::value_type>(); + break; + default: + OPENVINO_THROW(errorPrefix, " has unsupported precision: ", precision_mask); } } @@ -222,9 +222,9 @@ bool Bucketize::isExecutable() const { template void Bucketize::bucketize() { - const auto *input_data = getSrcDataAtPortAs(0); - const auto *boundaries_data = getSrcDataAtPortAs(1); - auto *output_data = getDstDataAtPortAs(0); + const auto* input_data = getSrcDataAtPortAs(0); + const auto* boundaries_data = getSrcDataAtPortAs(1); + auto* output_data = getDstDataAtPortAs(0); if (!with_bins) { memset(output_data, 0, num_values * sizeof(T_IND)); @@ -248,6 +248,6 @@ bool Bucketize::created() const { return getType() == Type::Bucketize; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/bucketize.h b/src/plugins/intel_cpu/src/nodes/bucketize.h index c834921a38ce54..0ecdd633838950 100644 --- a/src/plugins/intel_cpu/src/nodes/bucketize.h +++ b/src/plugins/intel_cpu/src/nodes/bucketize.h @@ -14,7 +14,7 @@ class Bucketize : public Node { public: Bucketize(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() 
override; void execute(dnnl::stream strm) override; bool created() const override; @@ -46,6 +46,6 @@ class Bucketize : public Node { std::string errorPrefix; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/causal_mask_preprocess.cpp b/src/plugins/intel_cpu/src/nodes/causal_mask_preprocess.cpp index 674d77265c9219..fd015a372ed1db 100644 --- a/src/plugins/intel_cpu/src/nodes/causal_mask_preprocess.cpp +++ b/src/plugins/intel_cpu/src/nodes/causal_mask_preprocess.cpp @@ -4,16 +4,16 @@ #include "causal_mask_preprocess.h" +#include +#include +#include + #include "common/bfloat16.hpp" #include "common/cpu_memcpy.h" #include "cpu/x64/cpu_isa_traits.hpp" #include "shape_inference/shape_inference_internal_dyn.hpp" #include "utils/plain_tensor.hpp" -#include -#include -#include - namespace ov { namespace intel_cpu { namespace node { @@ -48,7 +48,7 @@ The functionality is equivalent to following python code: template struct CausalMaskPreprocess::ExecutorCausalMaskPreprocess : public CausalMaskPreprocess::Executor { void execute(dnnl::stream strm, - intel_cpu::Node * pnode, + intel_cpu::Node* pnode, const intel_cpu::CausalMaskPreprocessNode::Config& config) override { ov::intel_cpu::PlainTensor t_attention_mask(pnode->getSrcMemoryAtPort(0)); ov::intel_cpu::PlainTensor t_batch_size(pnode->getSrcMemoryAtPort(1)); @@ -64,7 +64,14 @@ struct CausalMaskPreprocess::ExecutorCausalMaskPreprocess : public CausalMaskPre pnode->redefineOutputMemory({newDims}); ov::intel_cpu::PlainTensor t_dst(pnode->getDstMemoryAtPort(0)); - DEBUG_LOG("CausalMaskPreprocess::execute", config.type, " batch_size=", batch_size, " qLen=", qLen, " kvLen=", kvLen); + DEBUG_LOG("CausalMaskPreprocess::execute", + config.type, + " batch_size=", + batch_size, + " qLen=", + qLen, + " kvLen=", + kvLen); DEBUG_LOG("CausalMaskPreprocess::execute attention_mask=", t_attention_mask); 
DEBUG_LOG("CausalMaskPreprocess::execute cache_positions=", t_cache_positions); @@ -81,7 +88,7 @@ struct CausalMaskPreprocess::ExecutorCausalMaskPreprocess : public CausalMaskPre bool cmask_eq0 = (j <= row); bool amask_eq0 = (pamask[j] == 0); bool padding_mask = (cmask_eq0 && amask_eq0); - pdst[j] = (padding_mask | (!cmask_eq0))? min_dtype : T(0); + pdst[j] = (padding_mask | (!cmask_eq0)) ? min_dtype : T(0); } for (; j < kvLen; j++) { bool cmask_eq0 = (j <= row); @@ -103,7 +110,8 @@ CausalMaskPreprocess::CausalMaskPreprocess(const std::shared_ptr& op, m_config = node->get_config(); } -bool CausalMaskPreprocess::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool CausalMaskPreprocess::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage) noexcept { try { const auto node = std::dynamic_pointer_cast(op); if (!node) { @@ -133,7 +141,8 @@ void CausalMaskPreprocess::initSupportedPrimitiveDescriptors() { oprecs[0] = ov::element::f32; } // all input precisions must be int32 - for (auto& prec : iprecs) prec = ov::element::i32; + for (auto& prec : iprecs) + prec = ov::element::i32; } else { OPENVINO_THROW("CPU: CausalMaskPreprocess type not supported : " + m_config.type); } diff --git a/src/plugins/intel_cpu/src/nodes/causal_mask_preprocess.h b/src/plugins/intel_cpu/src/nodes/causal_mask_preprocess.h index eeb997c4cefb9f..444f242b0597a7 100644 --- a/src/plugins/intel_cpu/src/nodes/causal_mask_preprocess.h +++ b/src/plugins/intel_cpu/src/nodes/causal_mask_preprocess.h @@ -32,7 +32,7 @@ class CausalMaskPreprocess : public Node { private: struct Executor { virtual void execute(dnnl::stream strm, - intel_cpu::Node * pnode, + intel_cpu::Node* pnode, const intel_cpu::CausalMaskPreprocessNode::Config& config) = 0; virtual ~Executor() = default; }; diff --git a/src/plugins/intel_cpu/src/nodes/col2im.cpp b/src/plugins/intel_cpu/src/nodes/col2im.cpp index 4b83e78fd82505..409607ea6bb89c 100644 --- 
a/src/plugins/intel_cpu/src/nodes/col2im.cpp +++ b/src/plugins/intel_cpu/src/nodes/col2im.cpp @@ -3,8 +3,9 @@ // #include "col2im.h" -#include "openvino/reference/col2im.hpp" + #include "openvino/op/col2im.hpp" +#include "openvino/reference/col2im.hpp" namespace ov { namespace intel_cpu { @@ -62,42 +63,42 @@ void Col2Im::executeDynamicImpl(dnnl::stream strm) { template void Col2Im::executeImpl() { - ov::reference::col2im( - getSrcDataAtPortAs(0), - ov::Shape{getSrcMemoryAtPort(0)->getStaticDims()}, - getSrcDataAtPortAs(1), - getSrcDataAtPortAs(2), - getDstDataAtPortAs(0), - strides, - dilations, - padsBegin, - padsEnd); + ov::reference::col2im(getSrcDataAtPortAs(0), + ov::Shape{getSrcMemoryAtPort(0)->getStaticDims()}, + getSrcDataAtPortAs(1), + getSrcDataAtPortAs(2), + getDstDataAtPortAs(0), + strides, + dilations, + padsBegin, + padsEnd); } namespace { struct Col2ImContext { - Col2Im &node; + Col2Im& node; }; -} +} // namespace -template +template struct Col2Im::Col2ImExecute { using TData = typename std::tuple_element<0, T>::type; using TIndex = typename std::tuple_element<1, T>::type; - void operator()(Col2ImContext & ctx) { - ctx.node.executeImpl(); - } + void operator()(Col2ImContext& ctx) { + ctx.node.executeImpl(); + } }; void Col2Im::execute(dnnl::stream strm) { auto dataPrecision = getParentEdgeAt(0)->getMemory().getDesc().getPrecision(); auto indexPrecision = getParentEdgeAt(1)->getMemory().getDesc().getPrecision(); - Col2ImContext ctx = { - *this - }; + Col2ImContext ctx = {*this}; - OV_SWITCH(intel_cpu, Col2ImExecute, ctx, std::tie(dataPrecision, indexPrecision), + OV_SWITCH(intel_cpu, + Col2ImExecute, + ctx, + std::tie(dataPrecision, indexPrecision), OV_CASE2(ov::element::f32, ov::element::i32, float, int32_t), OV_CASE2(ov::element::f16, ov::element::i32, ov::float16, int32_t), OV_CASE2(ov::element::bf16, ov::element::i32, ov::bfloat16, int32_t), diff --git a/src/plugins/intel_cpu/src/nodes/col2im.h b/src/plugins/intel_cpu/src/nodes/col2im.h index 
9904689e53be0f..b56b4bb78469aa 100644 --- a/src/plugins/intel_cpu/src/nodes/col2im.h +++ b/src/plugins/intel_cpu/src/nodes/col2im.h @@ -26,7 +26,7 @@ class Col2Im : public Node { template void executeImpl(); - template + template struct Col2ImExecute; ov::Strides strides; diff --git a/src/plugins/intel_cpu/src/nodes/color_convert.cpp b/src/plugins/intel_cpu/src/nodes/color_convert.cpp index ea3c8e2c774944..a06214b768d6b4 100644 --- a/src/plugins/intel_cpu/src/nodes/color_convert.cpp +++ b/src/plugins/intel_cpu/src/nodes/color_convert.cpp @@ -3,14 +3,17 @@ // #include "color_convert.h" + #include -#include -#include -#include -#include + #include -#include "openvino/core/parallel.hpp" +#include +#include +#include +#include + #include "kernels/x64/jit_kernel.hpp" +#include "openvino/core/parallel.hpp" #include "shape_inference/custom/color_convert.hpp" using namespace dnnl::impl; @@ -39,7 +42,7 @@ class Converter : public ColorConvert::Converter { using Base = ColorConvert::Converter; public: - Converter(Node *node); + Converter(Node* node); bool singlePlane() const; @@ -47,12 +50,12 @@ class Converter : public ColorConvert::Converter { std::tuple yuv_to_rgb(float y, float u, float v); }; -Converter::Converter(Node *node) - : Base(node, node->getAlgorithm() == Algorithm::ColorConvertNV12toRGB - || node->getAlgorithm() == Algorithm::ColorConvertI420toRGB - ? ColorFormat { { 0, 1, 2 } } - : ColorFormat { { 2, 1, 0 } }) { -} +Converter::Converter(Node* node) + : Base(node, + node->getAlgorithm() == Algorithm::ColorConvertNV12toRGB || + node->getAlgorithm() == Algorithm::ColorConvertI420toRGB + ? 
ColorFormat{{0, 1, 2}} + : ColorFormat{{2, 1, 0}}) {} bool Converter::singlePlane() const { return _node->getOriginalInputsNumber() == 1; @@ -81,46 +84,43 @@ struct jit_uni_converter : public jit_kernel { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_converter) struct Params { - const void * y; - const void * u; - const void * v; - void * dst; + const void* y; + const void* u; + const void* v; + void* dst; size_t width; - uint8_t colorFormat; // RGB: 0, BGR: !=0 + uint8_t colorFormat; // RGB: 0, BGR: !=0 }; - typedef void (*function_t)(const Params *); + typedef void (*function_t)(const Params*); void init(); - void operator()(const Params & args) const { + void operator()(const Params& args) const { _fn(&args); } protected: jit_uni_converter(); - template - void yuv_to_rgb(const variable & y, - const variable & u, - const variable & v, - const variable & color_format, + template + void yuv_to_rgb(const variable& y, + const variable& u, + const variable& v, + const variable& color_format, bool round); - template - void store_tail(const variable & dst, - const variable & a, - const variable & b, - const variable & c, - const variable & size); + template + void store_tail(const variable& dst, + const variable& a, + const variable& b, + const variable& c, + const variable& size); function_t _fn; variable _consts; }; -jit_uni_converter::jit_uni_converter() - : jit_kernel(jit_name()), - _consts(*this) { -} +jit_uni_converter::jit_uni_converter() : jit_kernel(jit_name()), _consts(*this) {} void jit_uni_converter::init() { if (create_kernel() != status::success) @@ -128,15 +128,13 @@ void jit_uni_converter::init() { _fn = (function_t)jit_ker(); } -template -void jit_uni_converter::yuv_to_rgb(const variable & y, - const variable & u, - const variable & v, - const variable & color_format, +template +void jit_uni_converter::yuv_to_rgb(const variable& y, + const variable& u, + const variable& v, + const variable& color_format, bool round) { - auto clip = [&](const variable & op, - 
const variable & a, - const variable & b) { + auto clip = [&](const variable& op, const variable& a, const variable& b) { if (round) uni_vroundps(op, op, 0); uni_vmaxps(op, op, a); @@ -144,8 +142,12 @@ void jit_uni_converter::yuv_to_rgb(const variable & y, }; // blend r,g,b and put to r0,r1,r2 - auto blend = [&](const variable & r, const variable & g, const variable & b, - const variable & r0, const variable & r1, const variable & r2) { + auto blend = [&](const variable& r, + const variable& g, + const variable& b, + const variable& r0, + const variable& r1, + const variable& r2) { /* Input: r0,r1,r2,r3,r4,r5,r6,r7 @@ -174,7 +176,7 @@ void jit_uni_converter::yuv_to_rgb(const variable & y, */ auto genPermutationMask = [&](int offset) { - std::array mask {}; + std::array mask{}; for (uint8_t i = 0; i < mask.size(); ++i) mask[(i * 3 + offset) % mask.size()] = i; return mask; @@ -184,11 +186,8 @@ void jit_uni_converter::yuv_to_rgb(const variable & y, g.permute(genPermutationMask(1)); b.permute(genPermutationMask(2)); - auto blendWithMask = [&](int offset, const variable & result) { - static const uint32_t blendMasks[2] = { - 0x92492492, - 0x24924924 - }; + auto blendWithMask = [&](int offset, const variable& result) { + static const uint32_t blendMasks[2] = {0x92492492, 0x24924924}; const uint16_t mask0 = static_cast(blendMasks[0] >> ((offset * N) % 3)); const uint16_t mask1 = static_cast(blendMasks[1] >> ((offset * N) % 3)); @@ -208,29 +207,29 @@ void jit_uni_converter::yuv_to_rgb(const variable & y, auto b = var(); auto tmp = var(); - uni_vbroadcastss(tmp, ptr[_consts + 0 * sizeof(float)]); // tmp = [16.0f,16.0f,...] - uni_vsubps(y, y, tmp); // y = y - tmp - uni_vbroadcastss(tmp, ptr[_consts + 1 * sizeof(float)]); // tmp = [128.f,128.f,...] - uni_vsubps(u, u, tmp); // u = u - tmp - uni_vsubps(v, v, tmp); // v = v - tmp + uni_vbroadcastss(tmp, ptr[_consts + 0 * sizeof(float)]); // tmp = [16.0f,16.0f,...] 
+ uni_vsubps(y, y, tmp); // y = y - tmp + uni_vbroadcastss(tmp, ptr[_consts + 1 * sizeof(float)]); // tmp = [128.f,128.f,...] + uni_vsubps(u, u, tmp); // u = u - tmp + uni_vsubps(v, v, tmp); // v = v - tmp - uni_vbroadcastss(tmp, ptr[_consts + 2 * sizeof(float)]); // tmp = [1.164f,1.164f,...] - uni_vmulps(y, y, tmp); // y = y * tmp + uni_vbroadcastss(tmp, ptr[_consts + 2 * sizeof(float)]); // tmp = [1.164f,1.164f,...] + uni_vmulps(y, y, tmp); // y = y * tmp - uni_vbroadcastss(r, ptr[_consts + 3 * sizeof(float)]); // r = [1.596f,1.596f,...] - uni_vmulps(r, r, v); // r = r * v - uni_vaddps(r, r, y); // r = r + y + uni_vbroadcastss(r, ptr[_consts + 3 * sizeof(float)]); // r = [1.596f,1.596f,...] + uni_vmulps(r, r, v); // r = r * v + uni_vaddps(r, r, y); // r = r + y - uni_vbroadcastss(g, ptr[_consts + 4 * sizeof(float)]); // g = [0.391f,0.391f,...] - uni_vmulps(g, g, u); // g = g * u - uni_vsubps(g, y, g); // g = y - g - uni_vbroadcastss(tmp, ptr[_consts + 6 * sizeof(float)]); // tmp = [0.813f,0.813f,...] - uni_vmulps(tmp, tmp, v); // tmp = tmp * v - uni_vsubps(g, g, tmp); // g = g - tmp + uni_vbroadcastss(g, ptr[_consts + 4 * sizeof(float)]); // g = [0.391f,0.391f,...] + uni_vmulps(g, g, u); // g = g * u + uni_vsubps(g, y, g); // g = y - g + uni_vbroadcastss(tmp, ptr[_consts + 6 * sizeof(float)]); // tmp = [0.813f,0.813f,...] + uni_vmulps(tmp, tmp, v); // tmp = tmp * v + uni_vsubps(g, g, tmp); // g = g - tmp - uni_vbroadcastss(b, ptr[_consts + 5 * sizeof(float)]); // b = [2.018f,2.018f,...] - uni_vmulps(b, b, u); // b = b * u - uni_vaddps(b, b, y); // b = b + y + uni_vbroadcastss(b, ptr[_consts + 5 * sizeof(float)]); // b = [2.018f,2.018f,...] 
+ uni_vmulps(b, b, u); // b = b * u + uni_vaddps(b, b, y); // b = b + y // clip uni_vxorps(y, y, y); @@ -241,24 +240,30 @@ void jit_uni_converter::yuv_to_rgb(const variable & y, clip(b, y, u); _if(color_format == 0) - ._then([&]{ blend(r, g, b, y, u, v); }) - ._else([&]{ blend(b, g, r, y, u, v); }); + ._then([&] { + blend(r, g, b, y, u, v); + }) + ._else([&] { + blend(b, g, r, y, u, v); + }); } -template -void jit_uni_converter::store_tail(const variable & dst, - const variable & a, - const variable & b, - const variable & c, - const variable & size) { +template +void jit_uni_converter::store_tail(const variable& dst, + const variable& a, + const variable& b, + const variable& c, + const variable& size) { const size_t step = N * sizeof(T); auto s = stack(3 * step); auto sptr = var(); sptr = s.pointer(); - store(sptr, a); sptr += step; - store(sptr, b); sptr += step; + store(sptr, a); + sptr += step; + store(sptr, b); + sptr += step; store(sptr, c); auto copy_size = size * size_t(3u); @@ -269,36 +274,33 @@ void jit_uni_converter::store_tail(const variable & dst, namespace nv12 { -ColorConvert::Converter::PrimitiveDescs supportedPrimitiveDescs(Node *node) { - const LayoutType layout = LayoutType::ncsp; // 0,1,2,3 +ColorConvert::Converter::PrimitiveDescs supportedPrimitiveDescs(Node* node) { + const LayoutType layout = LayoutType::ncsp; // 0,1,2,3 - const ov::element::Type precision = node->getOriginalInputPrecisionAtPort(0) == ov::element::u8 - ? ov::element::u8 - : ov::element::f32; + const ov::element::Type precision = + node->getOriginalInputPrecisionAtPort(0) == ov::element::u8 ? ov::element::u8 : ov::element::f32; ColorConvert::Converter::PrimitiveDescs descs; - descs.emplace_back(std::vector { node->getOriginalInputsNumber(), { layout, precision } }, - std::vector { { layout, precision } }, - mayiuse(cpu_isa_t::sse41) - ? 
impl_desc_type::jit_uni - : impl_desc_type::ref, - true); + descs.emplace_back(std::vector{node->getOriginalInputsNumber(), {layout, precision}}, + std::vector{{layout, precision}}, + mayiuse(cpu_isa_t::sse41) ? impl_desc_type::jit_uni : impl_desc_type::ref, + true); return descs; } -template +template class SinglePlaneConvert; -template +template class TwoPlaneConvert; class RefConverter : public Converter { public: - RefConverter(Node *node); + RefConverter(Node* node); protected: - template + template void convert(const T* y, const T* uv, T* dst, @@ -309,15 +311,14 @@ class RefConverter : public Converter { size_t stride_uv); }; -RefConverter::RefConverter(Node *node) - : Converter(node) { +RefConverter::RefConverter(Node* node) : Converter(node) { if (node->getOriginalInputsNumber() != (singlePlane() ? 1 : 2)) OPENVINO_THROW("NV12Converter node has incorrect number of inputs"); if (!node->getOriginalOutputsNumber()) OPENVINO_THROW("NV12Converter node has incorrect number of outputs"); } -template +template void RefConverter::convert(const T* y, const T* uv, T* dst, @@ -346,13 +347,13 @@ void RefConverter::convert(const T* y, }); } -template +template class SinglePlaneConvert : public RefConverter { public: using RefConverter::RefConverter; void execute(dnnl::stream strm) override { - const auto & dims = inputDims(0); + const auto& dims = inputDims(0); const size_t batch_size = dims[N_DIM]; const size_t height = dims[H_DIM] * 2 / 3; @@ -362,22 +363,17 @@ class SinglePlaneConvert : public RefConverter { const T* uv = y + width * height; T* dst = static_cast(output(0)); - convert(y, uv, dst, - batch_size, - height, - width, - height * width * 3 / 2, - height * width * 3 / 2); + convert(y, uv, dst, batch_size, height, width, height * width * 3 / 2, height * width * 3 / 2); } }; -template +template class TwoPlaneConvert : public RefConverter { public: using RefConverter::RefConverter; void execute(dnnl::stream strm) override { - const auto & dims = inputDims(0); + 
const auto& dims = inputDims(0); const T* y = static_cast(input(0)); const T* uv = static_cast(input(1)); @@ -387,34 +383,24 @@ class TwoPlaneConvert : public RefConverter { const size_t height = dims[H_DIM]; const size_t width = dims[W_DIM]; - convert(y, uv, dst, - batch_size, - height, - width, - height * width, - height * width / 2); + convert(y, uv, dst, batch_size, height, width, height * width, height * width / 2); } }; #if defined(OPENVINO_ARCH_X86_64) -template +template class JitConverter; -template +template class JitConverter : public jit_uni_converter { private: void generate() override; - std::tuple, - variable, - variable> - load_yuv(const variable & src_y, - const variable & src_uv); - std::tuple, - variable> - unpack_uv(const variable & uv); + std::tuple, variable, variable> load_yuv(const variable& src_y, + const variable& src_uv); + std::tuple, variable> unpack_uv(const variable& uv); }; -template +template void JitConverter::generate() { preamble(); @@ -425,7 +411,7 @@ void JitConverter::generate() { auto width = arg(&Params::width); auto colorFormat = arg(&Params::colorFormat); - static const float data[8] = { 16.f, 128.f, 1.164f, 1.596f, 0.391f, 2.018f, 0.813f, 255.f }; + static const float data[8] = {16.f, 128.f, 1.164f, 1.596f, 0.391f, 2.018f, 0.813f, 255.f}; _consts = data; const size_t reg_capacity_log = static_cast(std::logb(N)); @@ -433,26 +419,29 @@ void JitConverter::generate() { width >>= reg_capacity_log; - foreach(0, width, [&](const Reg64 & idx) { + foreach (0, width, [&](const Reg64& idx) { auto yuv = load_yuv(src_y, src_uv); // Aliases - const auto & y = std::get<0>(yuv); - const auto & u = std::get<1>(yuv); - const auto & v = std::get<2>(yuv); + const auto& y = std::get<0>(yuv); + const auto& u = std::get<1>(yuv); + const auto& v = std::get<2>(yuv); yuv_to_rgb(y, u, v, colorFormat, std::is_integral::value); - store(dst, y); dst += step; - store(dst, u); dst += step; - store(dst, v); dst += step; - }); + store(dst, y); + dst += 
step; + store(dst, u); + dst += step; + store(dst, v); + dst += step; + }) + ; mov(width, argPtr(&Params::width)); width &= N - 1; - _if(width != 0) - ._then([&] { + _if(width != 0)._then([&] { auto y = var(); auto uv = var(); @@ -462,8 +451,8 @@ void JitConverter::generate() { auto uv_pair = unpack_uv(uv); // Aliases - const auto & u = std::get<0>(uv_pair); - const auto & v = std::get<1>(uv_pair); + const auto& u = std::get<0>(uv_pair); + const auto& v = std::get<1>(uv_pair); yuv_to_rgb(y, u, v, colorFormat, std::is_integral::value); @@ -473,12 +462,9 @@ void JitConverter::generate() { postamble(); } -template -std::tuple, - jit_kernel::variable, - jit_kernel::variable> -JitConverter::load_yuv(const variable & src_y, - const variable & src_uv) { +template +std::tuple, jit_kernel::variable, jit_kernel::variable> +JitConverter::load_yuv(const variable& src_y, const variable& src_uv) { auto y = var(); auto uv = var(); @@ -490,29 +476,26 @@ JitConverter::load_yuv(const variable & src_y, src_y += N * sizeof(T); src_uv += N * sizeof(T); - return std::make_tuple(std::move(y), - std::move(std::get<0>(uv_pair)), - std::move(std::get<1>(uv_pair))); + return std::make_tuple(std::move(y), std::move(std::get<0>(uv_pair)), std::move(std::get<1>(uv_pair))); } -template -std::tuple, - jit_kernel::variable> -JitConverter::unpack_uv(const variable & uv) { +template +std::tuple, jit_kernel::variable> JitConverter::unpack_uv( + const variable& uv) { auto u = var(); auto v = var(); - const uint8_t even_mask = 0xA0; // 0b10100000 - const uint8_t odd_mask = 0xF5; // 0b11110101 + const uint8_t even_mask = 0xA0; // 0b10100000 + const uint8_t odd_mask = 0xF5; // 0b11110101 - uni_vshufps(u, uv, uv, even_mask); // u = uv[0,0,2,2,4,4,6,6] - uni_vshufps(v, uv, uv, odd_mask); // v = uv[1,1,3,3,5,5,7,7] + uni_vshufps(u, uv, uv, even_mask); // u = uv[0,0,2,2,4,4,6,6] + uni_vshufps(v, uv, uv, odd_mask); // v = uv[1,1,3,3,5,5,7,7] return std::make_tuple(std::move(u), std::move(v)); } -template 
-const jit_uni_converter & jit_converter_create() { +template +const jit_uni_converter& jit_converter_create() { auto createKernel = []() { std::unique_ptr kernel; @@ -540,22 +523,21 @@ const jit_uni_converter & jit_converter_create() { return *kernel; } -template -const jit_uni_converter & jit_converter_get() { +template +const jit_uni_converter& jit_converter_get() { return jit_converter_create(); } -template +template class SinglePlaneConvert : public Converter { public: - SinglePlaneConvert(Node *node) - : Converter(node) { + SinglePlaneConvert(Node* node) : Converter(node) { jit_converter_create(); } void execute(dnnl::stream strm) override { - const auto & kernel = jit_converter_get(); - const auto & dims = inputDims(0); + const auto& kernel = jit_converter_get(); + const auto& dims = inputDims(0); const size_t batch_size = dims[N_DIM]; const size_t height = dims[H_DIM] * 2 / 3; @@ -574,23 +556,22 @@ class SinglePlaneConvert : public Converter { args.u = args.v = uv + batch * stride_uv + (h / 2) * width; args.dst = dst + (batch * width * height + h * width) * 3; args.width = width; - args.colorFormat = _colorFormat[0]; // The first byte is enough to determine the RGB or BGR format. + args.colorFormat = _colorFormat[0]; // The first byte is enough to determine the RGB or BGR format. 
kernel(args); }); } }; -template +template class TwoPlaneConvert : public Converter { public: - TwoPlaneConvert(Node *node) - : Converter(node) { + TwoPlaneConvert(Node* node) : Converter(node) { jit_converter_create(); } void execute(dnnl::stream strm) override { - const auto & kernel = jit_converter_get(); - const auto & dims = inputDims(0); + const auto& kernel = jit_converter_get(); + const auto& dims = inputDims(0); const size_t batch_size = dims[N_DIM]; const size_t height = dims[H_DIM]; @@ -609,46 +590,43 @@ class TwoPlaneConvert : public Converter { args.u = args.v = uv + batch * stride_uv + (h / 2) * width; args.dst = dst + (batch * width * height + h * width) * 3; args.width = width; - args.colorFormat = _colorFormat[0]; // The first byte is enough to determine the RGB or BGR format. + args.colorFormat = _colorFormat[0]; // The first byte is enough to determine the RGB or BGR format. kernel(args); }); } }; #endif -} // namespace nv12 +} // namespace nv12 namespace i420 { -ColorConvert::Converter::PrimitiveDescs supportedPrimitiveDescs(Node *node) { - const LayoutType layout = LayoutType::ncsp; // 0,1,2,3 +ColorConvert::Converter::PrimitiveDescs supportedPrimitiveDescs(Node* node) { + const LayoutType layout = LayoutType::ncsp; // 0,1,2,3 - const ov::element::Type precision = node->getOriginalInputPrecisionAtPort(0) == ov::element::u8 - ? ov::element::u8 - : ov::element::f32; + const ov::element::Type precision = + node->getOriginalInputPrecisionAtPort(0) == ov::element::u8 ? ov::element::u8 : ov::element::f32; ColorConvert::Converter::PrimitiveDescs descs; - descs.emplace_back(std::vector { node->getOriginalInputsNumber(), { layout, precision } }, - std::vector { { layout, precision } }, - mayiuse(cpu_isa_t::sse41) - ? impl_desc_type::jit_uni - : impl_desc_type::ref, - true); + descs.emplace_back(std::vector{node->getOriginalInputsNumber(), {layout, precision}}, + std::vector{{layout, precision}}, + mayiuse(cpu_isa_t::sse41) ? 
impl_desc_type::jit_uni : impl_desc_type::ref, + true); return descs; } -template +template class SinglePlaneConvert; -template +template class ThreePlaneConvert; class RefConverter : public Converter { public: - RefConverter(Node *node); + RefConverter(Node* node); protected: - template + template void convert(const T* y, const T* u, const T* v, @@ -660,15 +638,14 @@ class RefConverter : public Converter { size_t stride_uv); }; -RefConverter::RefConverter(Node *node) - : Converter(node) { - if (node->getOriginalInputsNumber() != (singlePlane() ? 1: 3)) +RefConverter::RefConverter(Node* node) : Converter(node) { + if (node->getOriginalInputsNumber() != (singlePlane() ? 1 : 3)) OPENVINO_THROW("I420Converter node has incorrect number of inputs"); if (!node->getOriginalOutputsNumber()) OPENVINO_THROW("I420Converter node has incorrect number of outputs"); } -template +template void RefConverter::convert(const T* y, const T* u, const T* v, @@ -699,13 +676,13 @@ void RefConverter::convert(const T* y, }); } -template +template class SinglePlaneConvert : public RefConverter { public: using RefConverter::RefConverter; void execute(dnnl::stream strm) override { - const auto & dims = inputDims(0); + const auto& dims = inputDims(0); const size_t batch_size = dims[N_DIM]; const size_t height = dims[H_DIM] * 2 / 3; @@ -716,22 +693,17 @@ class SinglePlaneConvert : public RefConverter { const T* v = y + 5 * width * height / 4; T* dst = static_cast(output(0)); - convert(y, u, v, dst, - batch_size, - height, - width, - height * width * 3 / 2, - height * width * 3 / 2); + convert(y, u, v, dst, batch_size, height, width, height * width * 3 / 2, height * width * 3 / 2); } }; -template +template class ThreePlaneConvert : public RefConverter { public: using RefConverter::RefConverter; void execute(dnnl::stream strm) override { - const auto & dims = inputDims(0); + const auto& dims = inputDims(0); const T* y = static_cast(input(0)); const T* u = static_cast(input(1)); @@ -742,34 +714,25 
@@ class ThreePlaneConvert : public RefConverter { const size_t height = dims[H_DIM]; const size_t width = dims[W_DIM]; - convert(y, u, v, dst, - batch_size, - height, - width, - height * width, - height * width / 4); + convert(y, u, v, dst, batch_size, height, width, height * width, height * width / 4); } }; #if defined(OPENVINO_ARCH_X86_64) -template +template class JitConverter; -template +template class JitConverter : public jit_uni_converter { private: void generate() override; - std::tuple, - variable, - variable> - load_yuv(const variable & src_y, - const variable & src_u, - const variable & src_v); - void unpack_uv(const variable & u, - const variable & v); + std::tuple, variable, variable> load_yuv(const variable& src_y, + const variable& src_u, + const variable& src_v); + void unpack_uv(const variable& u, const variable& v); }; -template +template void JitConverter::generate() { preamble(); @@ -781,7 +744,7 @@ void JitConverter::generate() { auto width = arg(&Params::width); auto colorFormat = arg(&Params::colorFormat); - static const float data[8] = { 16.f, 128.f, 1.164f, 1.596f, 0.391f, 2.018f, 0.813f, 255.f }; + static const float data[8] = {16.f, 128.f, 1.164f, 1.596f, 0.391f, 2.018f, 0.813f, 255.f}; _consts = data; const size_t reg_capacity_log = static_cast(std::logb(N)); @@ -789,26 +752,29 @@ void JitConverter::generate() { width >>= reg_capacity_log; - foreach(0, width, [&](const Reg64 & idx) { + foreach (0, width, [&](const Reg64& idx) { auto yuv = load_yuv(src_y, src_u, src_v); // Aliases - const auto & y = std::get<0>(yuv); - const auto & u = std::get<1>(yuv); - const auto & v = std::get<2>(yuv); + const auto& y = std::get<0>(yuv); + const auto& u = std::get<1>(yuv); + const auto& v = std::get<2>(yuv); yuv_to_rgb(y, u, v, colorFormat, std::is_integral::value); - store(dst, y); dst += step; - store(dst, u); dst += step; - store(dst, v); dst += step; - }); + store(dst, y); + dst += step; + store(dst, u); + dst += step; + store(dst, v); + dst += 
step; + }) + ; mov(width, argPtr(&Params::width)); width &= N - 1; - _if(width != 0) - ._then([&] { + _if(width != 0)._then([&] { auto y = var(); auto u = var(); auto v = var(); @@ -829,13 +795,11 @@ void JitConverter::generate() { postamble(); } -template -std::tuple, - jit_kernel::variable, - jit_kernel::variable> -JitConverter::load_yuv(const variable & src_y, - const variable & src_u, - const variable & src_v) { +template +std::tuple, jit_kernel::variable, jit_kernel::variable> +JitConverter::load_yuv(const variable& src_y, + const variable& src_u, + const variable& src_v) { auto y = var(); auto u = var(); auto v = var(); @@ -853,16 +817,15 @@ JitConverter::load_yuv(const variable & src_y, return std::make_tuple(std::move(y), std::move(u), std::move(v)); } -template -void JitConverter::unpack_uv(const variable & u, - const variable & v) { - static const uint8_t order[] = { 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 }; +template +void JitConverter::unpack_uv(const variable& u, const variable& v) { + static const uint8_t order[] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7}; u.permute(order); v.permute(order); } -template -const jit_uni_converter & jit_converter_create() { +template +const jit_uni_converter& jit_converter_create() { auto createKernel = []() { std::unique_ptr kernel; @@ -890,22 +853,21 @@ const jit_uni_converter & jit_converter_create() { return *kernel; } -template -const jit_uni_converter & jit_converter_get() { +template +const jit_uni_converter& jit_converter_get() { return jit_converter_create(); } -template +template class SinglePlaneConvert : public Converter { public: - SinglePlaneConvert(Node *node) - : Converter(node) { + SinglePlaneConvert(Node* node) : Converter(node) { jit_converter_create(); } void execute(dnnl::stream strm) override { - const auto & kernel = jit_converter_get(); - const auto & dims = inputDims(0); + const auto& kernel = jit_converter_get(); + const auto& dims = inputDims(0); const size_t batch_size = 
dims[N_DIM]; const size_t height = dims[H_DIM] * 2 / 3; @@ -926,23 +888,22 @@ class SinglePlaneConvert : public Converter { args.v = v + batch * stride_uv + (h / 2) * (width / 2); args.dst = dst + (batch * width * height + h * width) * 3; args.width = width; - args.colorFormat = _colorFormat[0]; // The first byte is enough to determine the RGB or BGR format. + args.colorFormat = _colorFormat[0]; // The first byte is enough to determine the RGB or BGR format. kernel(args); }); } }; -template +template class ThreePlaneConvert : public Converter { public: - ThreePlaneConvert(Node *node) - : Converter(node) { + ThreePlaneConvert(Node* node) : Converter(node) { jit_converter_create(); } void execute(dnnl::stream strm) override { - const auto & kernel = jit_converter_get(); - const auto & dims = inputDims(0); + const auto& kernel = jit_converter_get(); + const auto& dims = inputDims(0); const T* y = static_cast(input(0)); const T* u = static_cast(input(1)); @@ -963,20 +924,19 @@ class ThreePlaneConvert : public Converter { args.v = v + batch * stride_uv + (h / 2) * (width / 2); args.dst = dst + (batch * width * height + h * width) * 3; args.width = width; - args.colorFormat = _colorFormat[0]; // The first byte is enough to determine the RGB or BGR format. + args.colorFormat = _colorFormat[0]; // The first byte is enough to determine the RGB or BGR format. 
kernel(args); }); } }; #endif -} // namespace i420 +} // namespace i420 -} // namespace +} // namespace -ColorConvert::Converter::Converter(Node *node, const ColorFormat & colorFormat) - : _node(node) - , _colorFormat(colorFormat) { -} +ColorConvert::Converter::Converter(Node* node, const ColorFormat& colorFormat) + : _node(node), + _colorFormat(colorFormat) {} ov::element::Type ColorConvert::Converter::inputPrecision(size_t idx) const { return _node->getParentEdgeAt(idx)->getMemory().getDesc().getPrecision(); @@ -986,15 +946,15 @@ ov::element::Type ColorConvert::Converter::outputPrecision(size_t idx) const { return _node->getChildEdgeAt(idx)->getMemory().getDesc().getPrecision(); } -const void * ColorConvert::Converter::input(size_t idx) const { +const void* ColorConvert::Converter::input(size_t idx) const { return _node->getSrcDataAtPort(idx); } -void * ColorConvert::Converter::output(size_t idx) const { +void* ColorConvert::Converter::output(size_t idx) const { return _node->getDstDataAtPort(idx); } -const VectorDims & ColorConvert::Converter::inputDims(size_t idx) const { +const VectorDims& ColorConvert::Converter::inputDims(size_t idx) const { return _node->getParentEdgeAt(idx)->getMemory().getStaticDims(); } @@ -1019,42 +979,42 @@ void ColorConvert::initSupportedPrimitiveDescriptors() { return; switch (algorithm) { - case Algorithm::ColorConvertNV12toRGB: - case Algorithm::ColorConvertNV12toBGR: { - for (const auto &desc : nv12::supportedPrimitiveDescs(this)) { - const auto & inPortConfigs = std::get<0>(desc); - const auto & outPortConfigs = std::get<1>(desc); - const auto implType = std::get<2>(desc); - addSupportedPrimDesc(inPortConfigs, outPortConfigs, implType); - } - initSupportedNV12Impls(); - break; + case Algorithm::ColorConvertNV12toRGB: + case Algorithm::ColorConvertNV12toBGR: { + for (const auto& desc : nv12::supportedPrimitiveDescs(this)) { + const auto& inPortConfigs = std::get<0>(desc); + const auto& outPortConfigs = std::get<1>(desc); + const 
auto implType = std::get<2>(desc); + addSupportedPrimDesc(inPortConfigs, outPortConfigs, implType); } - case Algorithm::ColorConvertI420toRGB: - case Algorithm::ColorConvertI420toBGR: { - for (const auto &desc : i420::supportedPrimitiveDescs(this)) { - const auto & inPortConfigs = std::get<0>(desc); - const auto & outPortConfigs = std::get<1>(desc); - const auto implType = std::get<2>(desc); - addSupportedPrimDesc(inPortConfigs, outPortConfigs, implType); - } - initSupportedI420Impls(); - break; + initSupportedNV12Impls(); + break; + } + case Algorithm::ColorConvertI420toRGB: + case Algorithm::ColorConvertI420toBGR: { + for (const auto& desc : i420::supportedPrimitiveDescs(this)) { + const auto& inPortConfigs = std::get<0>(desc); + const auto& outPortConfigs = std::get<1>(desc); + const auto implType = std::get<2>(desc); + addSupportedPrimDesc(inPortConfigs, outPortConfigs, implType); } - default: - break; + initSupportedI420Impls(); + break; + } + default: + break; } } void ColorConvert::initSupportedNV12Impls() { - #define SUPPORTED_IMPL(Impl, type, desc_type) \ - [](Node *node) { \ - return new nv12::Impl(node); \ - }; +#define SUPPORTED_IMPL(Impl, type, desc_type) \ + [](Node* node) { \ + return new nv12::Impl(node); \ + }; // ref { - auto &impls = _supportedImpls[impl_desc_type::ref][algorithm]; + auto& impls = _supportedImpls[impl_desc_type::ref][algorithm]; impls[ov::element::Type_t::u8][true] = SUPPORTED_IMPL(SinglePlaneConvert, uint8_t, ref); impls[ov::element::Type_t::u8][false] = SUPPORTED_IMPL(TwoPlaneConvert, uint8_t, ref); impls[ov::element::Type_t::f32][true] = SUPPORTED_IMPL(SinglePlaneConvert, float, ref); @@ -1064,25 +1024,25 @@ void ColorConvert::initSupportedNV12Impls() { #if defined(OPENVINO_ARCH_X86_64) // jit_uni { - auto &impls = _supportedImpls[impl_desc_type::jit_uni][algorithm]; + auto& impls = _supportedImpls[impl_desc_type::jit_uni][algorithm]; impls[ov::element::Type_t::u8][true] = SUPPORTED_IMPL(SinglePlaneConvert, uint8_t, jit_uni); 
impls[ov::element::Type_t::u8][false] = SUPPORTED_IMPL(TwoPlaneConvert, uint8_t, jit_uni); impls[ov::element::Type_t::f32][true] = SUPPORTED_IMPL(SinglePlaneConvert, float, jit_uni); impls[ov::element::Type_t::f32][false] = SUPPORTED_IMPL(TwoPlaneConvert, float, jit_uni); } #endif - #undef SUPPORTED_IMPL +#undef SUPPORTED_IMPL } void ColorConvert::initSupportedI420Impls() { - #define SUPPORTED_IMPL(Impl, type, desc_type) \ - [](Node *node) { \ - return new i420::Impl(node); \ - }; +#define SUPPORTED_IMPL(Impl, type, desc_type) \ + [](Node* node) { \ + return new i420::Impl(node); \ + }; // ref { - auto &impls = _supportedImpls[impl_desc_type::ref][algorithm]; + auto& impls = _supportedImpls[impl_desc_type::ref][algorithm]; impls[ov::element::Type_t::u8][true] = SUPPORTED_IMPL(SinglePlaneConvert, uint8_t, ref); impls[ov::element::Type_t::u8][false] = SUPPORTED_IMPL(ThreePlaneConvert, uint8_t, ref); impls[ov::element::Type_t::f32][true] = SUPPORTED_IMPL(SinglePlaneConvert, float, ref); @@ -1092,32 +1052,29 @@ void ColorConvert::initSupportedI420Impls() { #if defined(OPENVINO_ARCH_X86_64) // jit_uni { - auto &impls = _supportedImpls[impl_desc_type::jit_uni][algorithm]; + auto& impls = _supportedImpls[impl_desc_type::jit_uni][algorithm]; impls[ov::element::Type_t::u8][true] = SUPPORTED_IMPL(SinglePlaneConvert, uint8_t, jit_uni); impls[ov::element::Type_t::u8][false] = SUPPORTED_IMPL(ThreePlaneConvert, uint8_t, jit_uni); impls[ov::element::Type_t::f32][true] = SUPPORTED_IMPL(SinglePlaneConvert, float, jit_uni); impls[ov::element::Type_t::f32][false] = SUPPORTED_IMPL(ThreePlaneConvert, float, jit_uni); } #endif - #undef SUPPORTED_IMPL +#undef SUPPORTED_IMPL } void ColorConvert::createPrimitive() { - const NodeDesc *desc = getSelectedPrimitiveDescriptor(); + const NodeDesc* desc = getSelectedPrimitiveDescriptor(); if (!desc) OPENVINO_THROW(getTypeStr() + " node with name '" + getName() + "' ", "no optimal primitive descriptor selected"); if (!_impl) { - const auto & cfg = 
desc->getConfig(); + const auto& cfg = desc->getConfig(); const auto precision = cfg.inConfs[0].getMemDesc()->getPrecision(); const bool isSinglePlane = cfg.inConfs.size() == 1; - _impl = std::unique_ptr(_supportedImpls - .at(desc->getImplementationType()) - .at(algorithm) - .at(precision) - .at(isSinglePlane)(this)); + _impl = std::unique_ptr( + _supportedImpls.at(desc->getImplementationType()).at(algorithm).at(precision).at(isSinglePlane)(this)); } } @@ -1139,6 +1096,6 @@ void ColorConvert::executeDynamicImpl(dnnl::stream strm) { execute(strm); } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/color_convert.h b/src/plugins/intel_cpu/src/nodes/color_convert.h index 19df1209dd4bab..9bd27c7cf9dffa 100644 --- a/src/plugins/intel_cpu/src/nodes/color_convert.h +++ b/src/plugins/intel_cpu/src/nodes/color_convert.h @@ -5,10 +5,11 @@ #pragma once #include -#include + +#include #include #include -#include +#include namespace ov { namespace intel_cpu { @@ -35,11 +36,11 @@ class ColorConvert : public Node { void initSupportedI420Impls(); private: - using ConverterBuilder = std::function; - using SupportedImpls = multidim_map; + using SupportedImpls = multidim_map; std::unique_ptr _impl; @@ -48,10 +49,11 @@ class ColorConvert : public Node { class ColorConvert::Converter { public: - using PrimitiveDescs = std::vector, // Input port configurator - std::vector, // Output port configurator - impl_desc_type, // Implementation type - bool>>; // // true - SinglePlaneConvert, false - TwoPlaneConvert/ThreePlaneConvert + using PrimitiveDescs = + std::vector, // Input port configurator + std::vector, // Output port configurator + impl_desc_type, // Implementation type + bool>>; // // true - SinglePlaneConvert, false - TwoPlaneConvert/ThreePlaneConvert using Shapes = std::vector; static constexpr size_t N_DIM = 0; @@ -61,20 +63,20 @@ class 
ColorConvert::Converter { using ColorFormat = std::array; - Converter(Node *node, const ColorFormat & colorFormat); + Converter(Node* node, const ColorFormat& colorFormat); virtual ~Converter() = default; ov::element::Type inputPrecision(size_t idx) const; ov::element::Type outputPrecision(size_t idx) const; - const void * input(size_t idx) const; - void * output(size_t idx) const; - const VectorDims & inputDims(size_t idx) const; + const void* input(size_t idx) const; + void* output(size_t idx) const; + const VectorDims& inputDims(size_t idx) const; virtual void execute(dnnl::stream strm) = 0; protected: - Node *_node; - ColorFormat _colorFormat; // RGB: {0,1,2}, BGR: {2,1,0} + Node* _node; + ColorFormat _colorFormat; // RGB: {0,1,2}, BGR: {2,1,0} }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/common/arbitrary_order_desc_creator.cpp b/src/plugins/intel_cpu/src/nodes/common/arbitrary_order_desc_creator.cpp index a7d3adc50d62e3..5887900ce8fa9e 100644 --- a/src/plugins/intel_cpu/src/nodes/common/arbitrary_order_desc_creator.cpp +++ b/src/plugins/intel_cpu/src/nodes/common/arbitrary_order_desc_creator.cpp @@ -3,26 +3,26 @@ // #include "arbitrary_order_desc_creator.h" + #include "utils/general_utils.h" namespace ov { namespace intel_cpu { -ArbitraryOrderDescCreator::ArbitraryOrderDescCreator(VectorDims order) : - m_order(std::move(order)) { +ArbitraryOrderDescCreator::ArbitraryOrderDescCreator(VectorDims order) : m_order(std::move(order)) { OPENVINO_ASSERT(std::adjacent_find(m_order.begin(), m_order.end()) == m_order.end(), - "Can't construct ArbitraryOrderDescCreator, order vector contains repetitive elements", - vec2str(m_order)); + "Can't construct ArbitraryOrderDescCreator, order vector contains repetitive elements", + vec2str(m_order)); } -CpuBlockedMemoryDesc -ArbitraryOrderDescCreator::createDesc(const ov::element::Type& precision, 
const Shape& srcShape) const { +CpuBlockedMemoryDesc ArbitraryOrderDescCreator::createDesc(const ov::element::Type& precision, + const Shape& srcShape) const { auto&& dims = srcShape.getDims(); OPENVINO_ASSERT(dims.size() == m_order.size(), - "Couldn't create a tensor descriptor, shape and order size mismatch. Shape: ", - vec2str(dims), - " order: ", - vec2str(m_order)); + "Couldn't create a tensor descriptor, shape and order size mismatch. Shape: ", + vec2str(dims), + " order: ", + vec2str(m_order)); VectorDims blkDims(dims.size()); for (size_t i = 0; i < dims.size(); ++i) { @@ -36,5 +36,5 @@ size_t ArbitraryOrderDescCreator::getMinimalRank() const { return m_order.size(); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/common/arbitrary_order_desc_creator.h b/src/plugins/intel_cpu/src/nodes/common/arbitrary_order_desc_creator.h index aaf5a7d5560799..c7341169fd9187 100644 --- a/src/plugins/intel_cpu/src/nodes/common/arbitrary_order_desc_creator.h +++ b/src/plugins/intel_cpu/src/nodes/common/arbitrary_order_desc_creator.h @@ -20,5 +20,5 @@ class ArbitraryOrderDescCreator : public BlockedDescCreator { VectorDims m_order; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/common/blocked_desc_creator.cpp b/src/plugins/intel_cpu/src/nodes/common/blocked_desc_creator.cpp index 88c351ecafbdc1..a7398cac1e9940 100644 --- a/src/plugins/intel_cpu/src/nodes/common/blocked_desc_creator.cpp +++ b/src/plugins/intel_cpu/src/nodes/common/blocked_desc_creator.cpp @@ -3,9 +3,8 @@ // #include "blocked_desc_creator.h" -#include - +#include namespace ov { namespace intel_cpu { @@ -15,17 +14,19 @@ constexpr size_t channelsPos = 1lu; class PlainFormatCreator : public BlockedDescCreator { public: - CpuBlockedMemoryDesc createDesc(const ov::element::Type &precision, const Shape& srcShape) const override { + 
CpuBlockedMemoryDesc createDesc(const ov::element::Type& precision, const Shape& srcShape) const override { VectorDims order(srcShape.getRank()); std::iota(order.begin(), order.end(), 0); return CpuBlockedMemoryDesc(precision, srcShape, srcShape.getDims(), order); } - size_t getMinimalRank() const override { return 0lu; } + size_t getMinimalRank() const override { + return 0lu; + } }; class PerChannelCreator : public BlockedDescCreator { public: - CpuBlockedMemoryDesc createDesc(const ov::element::Type &precision, const Shape& srcShape) const override { + CpuBlockedMemoryDesc createDesc(const ov::element::Type& precision, const Shape& srcShape) const override { VectorDims order(srcShape.getRank()); std::iota(order.begin(), order.end(), 0); VectorDims blkDims = srcShape.getDims(); @@ -41,7 +42,9 @@ class PerChannelCreator : public BlockedDescCreator { return CpuBlockedMemoryDesc(precision, srcShape, blkDims, order); } - size_t getMinimalRank() const override { return 3lu; } + size_t getMinimalRank() const override { + return 3lu; + } }; class ChannelBlockedCreator : public BlockedDescCreator { @@ -64,24 +67,27 @@ class ChannelBlockedCreator : public BlockedDescCreator { return CpuBlockedMemoryDesc(precision, srcShape, blkDims, order); } - size_t getMinimalRank() const override { return 3lu; } + size_t getMinimalRank() const override { + return 3lu; + } private: size_t _blockSize; }; -} // namespace +} // namespace const BlockedDescCreator::CreatorsMap& BlockedDescCreator::getCommonCreators() { - static const CreatorsMap map{ { LayoutType::nspc, CreatorConstPtr(new PerChannelCreator) }, - { LayoutType::nCsp8c, CreatorConstPtr(new ChannelBlockedCreator(8)) }, - { LayoutType::nCsp16c, CreatorConstPtr(new ChannelBlockedCreator(16)) }, - { LayoutType::ncsp, CreatorConstPtr(new PlainFormatCreator) } }; + static const CreatorsMap map{{LayoutType::nspc, CreatorConstPtr(new PerChannelCreator)}, + {LayoutType::nCsp8c, CreatorConstPtr(new ChannelBlockedCreator(8))}, + 
{LayoutType::nCsp16c, CreatorConstPtr(new ChannelBlockedCreator(16))}, + {LayoutType::ncsp, CreatorConstPtr(new PlainFormatCreator)}}; return map; } -std::pair -BlockedDescCreator::makeFilteredRange(const CreatorsMap &map, unsigned int rank) { +std::pair BlockedDescCreator::makeFilteredRange( + const CreatorsMap& map, + unsigned int rank) { auto rankFilter = [rank](const CreatorsMap::value_type& item) { if (item.second->getMinimalRank() > rank) { return false; @@ -94,8 +100,10 @@ BlockedDescCreator::makeFilteredRange(const CreatorsMap &map, unsigned int rank) return std::make_pair(first, last); } -std::pair -BlockedDescCreator::makeFilteredRange(const CreatorsMap& map, unsigned rank, const std::vector& supportedTypes) { +std::pair BlockedDescCreator::makeFilteredRange( + const CreatorsMap& map, + unsigned rank, + const std::vector& supportedTypes) { unsigned bitMask = 0ul; for (auto& item : supportedTypes) { bitMask |= 1 << static_cast(item); @@ -116,12 +124,13 @@ BlockedDescCreator::makeFilteredRange(const CreatorsMap& map, unsigned rank, con return std::make_pair(first, last); } -std::pair -BlockedDescCreator::makeFilteredRange(const CreatorsMap &map, BlockedDescCreator::Predicate predicate) { +std::pair BlockedDescCreator::makeFilteredRange( + const CreatorsMap& map, + BlockedDescCreator::Predicate predicate) { auto first = CreatorsMapFilterConstIterator(std::move(predicate), map.begin(), map.end()); auto last = first.end(); return std::make_pair(first, last); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/common/blocked_desc_creator.h b/src/plugins/intel_cpu/src/nodes/common/blocked_desc_creator.h index 1fd7a02dff984b..9f8b15b430c727 100644 --- a/src/plugins/intel_cpu/src/nodes/common/blocked_desc_creator.h +++ b/src/plugins/intel_cpu/src/nodes/common/blocked_desc_creator.h @@ -5,6 +5,7 @@ #pragma once #include + #include "cpu_shape.h" #include 
"memory_desc/cpu_blocked_memory_desc.h" @@ -22,15 +23,18 @@ class BlockedDescCreator { public: static const CreatorsMap& getCommonCreators(); - static std::pair - makeFilteredRange(const CreatorsMap &map, unsigned rank); + static std::pair makeFilteredRange( + const CreatorsMap& map, + unsigned rank); static std::pair makeFilteredRange(const CreatorsMap& map, unsigned rank, const std::vector& supportedTypes); - static std::pair - makeFilteredRange(const CreatorsMap& map, Predicate predicate); + static std::pair makeFilteredRange( + const CreatorsMap& map, + Predicate predicate); virtual CpuBlockedMemoryDesc createDesc(const ov::element::Type& precision, const Shape& srcShape) const = 0; - std::shared_ptr createSharedDesc(const ov::element::Type& precision, const Shape& srcShape) const { + std::shared_ptr createSharedDesc(const ov::element::Type& precision, + const Shape& srcShape) const { return std::make_shared(createDesc(precision, srcShape)); } @@ -49,7 +53,10 @@ class CreatorsMapFilterConstIterator { typedef std::function predicate_type; public: - CreatorsMapFilterConstIterator(predicate_type filter, Iterator begin, Iterator end) : _iter(begin), _end(end), _filter(std::move(filter)) { + CreatorsMapFilterConstIterator(predicate_type filter, Iterator begin, Iterator end) + : _iter(begin), + _end(end), + _filter(std::move(filter)) { while (_iter != _end && !_filter(*_iter)) { ++_iter; } @@ -93,5 +100,5 @@ class CreatorsMapFilterConstIterator { predicate_type _filter; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp b/src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp index ad0738e9d57558..a0590827006eb4 100644 --- a/src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp +++ b/src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp @@ -5,16 +5,16 @@ #include "cpu_convert.h" #include "cpu_memcpy.h" -#include "utils/bfloat16.hpp" #include 
"openvino/core/type/nf4.hpp" +#include "utils/bfloat16.hpp" #if defined(OPENVINO_ARCH_X86_64) -#include "nodes/kernels/x64/jit_kernel.hpp" +# include "nodes/kernels/x64/jit_kernel.hpp" #else -#include "cpu_memory.h" -#include "openvino/core/type/element_type_traits.hpp" -#include "selective_build.h" -#include "utils/general_utils.h" +# include "cpu_memory.h" +# include "openvino/core/type/element_type_traits.hpp" +# include "selective_build.h" +# include "utils/general_utils.h" #endif namespace ov { @@ -28,16 +28,12 @@ using namespace dnnl::impl::cpu::x64; using namespace Xbyak; template -void convert_vec(jit_generator & gen, - const RegExp & src, - const RegExp & dst); +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst); template <> -void convert_vec(jit_generator & gen, - const RegExp & src, - const RegExp & dst) { - auto const & f16vec = gen.xmm3; - auto const & f32vec = gen.ymm4; +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f16vec = gen.xmm3; + auto const& f32vec = gen.ymm4; gen.movdqu(f16vec, gen.xword[src]); gen.vcvtph2ps(f32vec, f16vec); @@ -45,11 +41,9 @@ void convert_vec(jit_generator & gen, } template <> -void convert_vec(jit_generator & gen, - const RegExp & src, - const RegExp & dst) { - auto const & f16vec = gen.xmm3; - auto const & f32vec = gen.ymm4; +void convert_vec(jit_generator& gen, const RegExp& src, const RegExp& dst) { + auto const& f16vec = gen.xmm3; + auto const& f32vec = gen.ymm4; gen.vmovups(f32vec, gen.yword[src]); gen.vcvtps2ph(f16vec, f32vec, 0); @@ -72,18 +66,18 @@ class jit_convert_array : public jit_kernel { size >>= vlen_log2; - foreach(0, size, [&, this](const Xbyak::Reg64& idx) { + foreach (0, size, [&, this](const Xbyak::Reg64& idx) { _convert_vec(*this, src, dst); src += _src_size * vlen; dst += _dst_size * vlen; - }); + }) + ; mov(size, argPtr(&args_t::count)); size &= vlen - 1; // Tail conversion - _if(size != 0) - ._then([&] { + _if(size != 0)._then([&] { 
auto tmp = stack(vlen * sizeof(float)); tmp.clear(); @@ -112,24 +106,19 @@ class jit_convert_array : public jit_kernel { typedef void (*fn_t)(const args_t*); - typedef void (*convert_vec_t)(jit_generator &, - const RegExp &, - const RegExp &); + typedef void (*convert_vec_t)(jit_generator&, const RegExp&, const RegExp&); - jit_convert_array(convert_vec_t convert_vec, - size_t src_size, - size_t dst_size) - : jit_kernel(jit_name()) - , _convert_vec(convert_vec) - , _src_size(src_size) - , _dst_size(dst_size) {} + jit_convert_array(convert_vec_t convert_vec, size_t src_size, size_t dst_size) + : jit_kernel(jit_name()), + _convert_vec(convert_vec), + _src_size(src_size), + _dst_size(dst_size) {} - template + template static fn_t get() { - if (mayiuse(cpu_isa_t::avx2) - && dnnl::impl::cpu::x64::cpu().has(Xbyak::util::Cpu::tF16C)) { + if (mayiuse(cpu_isa_t::avx2) && dnnl::impl::cpu::x64::cpu().has(Xbyak::util::Cpu::tF16C)) { static jit_convert_array converter(convert_vec, sizeof(src_t), sizeof(dst_t)); - auto & generator = static_cast(converter); + auto& generator = static_cast(converter); generator.create_kernel(); return (fn_t)generator.jit_ker(); } @@ -148,7 +137,7 @@ void jit_convert(const TI* arg, TO* out, size_t count) { static auto converter = jit_impl::get(); if (converter) { - typename jit_impl::args_t args = { arg, out, count }; + typename jit_impl::args_t args = {arg, out, count}; converter(&args); } else { for (size_t i = 0; i < count; ++i) { @@ -179,44 +168,41 @@ struct PrecisionInfo { using value_type = uint8_t; }; -template::value - || std::is_same::value, - float, T>::type> +template ::value || + std::is_same::value, + float, + T>::type> struct Range { - const std::tuple & fit(const ov::element::Type & prec); + const std::tuple& fit(const ov::element::Type& prec); private: - std::tuple _range { - std::numeric_limits::lowest(), - std::numeric_limits::max() - }; + std::tuple _range{std::numeric_limits::lowest(), std::numeric_limits::max()}; }; -template 
-const std::tuple & Range::fit(const ov::element::Type & prec) { +template +const std::tuple& Range::fit(const ov::element::Type& prec) { if (prec.is_real()) { double lbound, ubound; switch (prec) { - case ov::element::bf16: - lbound = static_cast(std::numeric_limits::lowest()); - ubound = static_cast(std::numeric_limits::max()); - break; - case ov::element::f16: - lbound = static_cast(std::numeric_limits::lowest()); - ubound = static_cast(std::numeric_limits::max()); - break; - case ov::element::f32: - lbound = static_cast(std::numeric_limits::lowest()); - ubound = static_cast(std::numeric_limits::max()); - break; - case ov::element::f64: - lbound = std::numeric_limits::lowest(); - ubound = std::numeric_limits::max(); - break; - default: - OPENVINO_THROW("Unsupported precision"); + case ov::element::bf16: + lbound = static_cast(std::numeric_limits::lowest()); + ubound = static_cast(std::numeric_limits::max()); + break; + case ov::element::f16: + lbound = static_cast(std::numeric_limits::lowest()); + ubound = static_cast(std::numeric_limits::max()); + break; + case ov::element::f32: + lbound = static_cast(std::numeric_limits::lowest()); + ubound = static_cast(std::numeric_limits::max()); + break; + case ov::element::f64: + lbound = std::numeric_limits::lowest(); + ubound = std::numeric_limits::max(); + break; + default: + OPENVINO_THROW("Unsupported precision"); } // If U is integral, its range always less than float, so not need update _range // Else it will be overflow, for example static_cast double to int64_t: @@ -224,73 +210,71 @@ const std::tuple & Range::fit(const ov::element::Type & prec) { // double dd_ubound = static_cast(ubbound) // static_cast(dd_ubound) will return -9223372036854775808 if (!std::is_integral::value) { - std::get<0>(_range) = static_cast(std::max(static_cast(std::get<0>(_range)), lbound)); - std::get<1>(_range) = static_cast(std::min(static_cast(std::get<1>(_range)), ubound)); + std::get<0>(_range) = 
static_cast(std::max(static_cast(std::get<0>(_range)), lbound)); + std::get<1>(_range) = static_cast(std::min(static_cast(std::get<1>(_range)), ubound)); } } else { int64_t lbound; uint64_t ubound; switch (prec) { - case ov::element::boolean: - lbound = static_cast(std::numeric_limits::lowest()); - ubound = static_cast(std::numeric_limits::max()); - break; - case ov::element::u8: - lbound = static_cast(std::numeric_limits::lowest()); - ubound = static_cast(std::numeric_limits::max()); - break; - case ov::element::i8: - lbound = static_cast(std::numeric_limits::lowest()); - ubound = static_cast(std::numeric_limits::max()); - break; - case ov::element::u16: - lbound = static_cast(std::numeric_limits::lowest()); - ubound = static_cast(std::numeric_limits::max()); - break; - case ov::element::i16: - lbound = static_cast(std::numeric_limits::lowest()); - ubound = static_cast(std::numeric_limits::max()); - break; - case ov::element::u32: - lbound = static_cast(std::numeric_limits::lowest()); - ubound = static_cast(std::numeric_limits::max()); - break; - case ov::element::i32: - lbound = static_cast(std::numeric_limits::lowest()); - ubound = static_cast(std::numeric_limits::max()); - break; - case ov::element::u64: - lbound = static_cast(std::numeric_limits::lowest()); - ubound = static_cast(std::numeric_limits::max()); - break; - case ov::element::i64: - lbound = static_cast(std::numeric_limits::lowest()); - ubound = static_cast(std::numeric_limits::max()); - break; - default: - OPENVINO_THROW("Unsupported precision"); + case ov::element::boolean: + lbound = static_cast(std::numeric_limits::lowest()); + ubound = static_cast(std::numeric_limits::max()); + break; + case ov::element::u8: + lbound = static_cast(std::numeric_limits::lowest()); + ubound = static_cast(std::numeric_limits::max()); + break; + case ov::element::i8: + lbound = static_cast(std::numeric_limits::lowest()); + ubound = static_cast(std::numeric_limits::max()); + break; + case ov::element::u16: + lbound = 
static_cast(std::numeric_limits::lowest()); + ubound = static_cast(std::numeric_limits::max()); + break; + case ov::element::i16: + lbound = static_cast(std::numeric_limits::lowest()); + ubound = static_cast(std::numeric_limits::max()); + break; + case ov::element::u32: + lbound = static_cast(std::numeric_limits::lowest()); + ubound = static_cast(std::numeric_limits::max()); + break; + case ov::element::i32: + lbound = static_cast(std::numeric_limits::lowest()); + ubound = static_cast(std::numeric_limits::max()); + break; + case ov::element::u64: + lbound = static_cast(std::numeric_limits::lowest()); + ubound = static_cast(std::numeric_limits::max()); + break; + case ov::element::i64: + lbound = static_cast(std::numeric_limits::lowest()); + ubound = static_cast(std::numeric_limits::max()); + break; + default: + OPENVINO_THROW("Unsupported precision"); } - using ltype = typename std::conditional< - std::is_floating_point::value, - double, int64_t>::type; - using utype = typename std::conditional< - std::is_floating_point::value, - double, uint64_t>::type; - std::get<0>(_range) = static_cast(std::max(static_cast(std::get<0>(_range)), static_cast(lbound))); - std::get<1>(_range) = static_cast(std::min(static_cast(std::get<1>(_range)), static_cast(ubound))); + using ltype = typename std::conditional::value, double, int64_t>::type; + using utype = typename std::conditional::value, double, uint64_t>::type; + std::get<0>(_range) = + static_cast(std::max(static_cast(std::get<0>(_range)), static_cast(lbound))); + std::get<1>(_range) = + static_cast(std::min(static_cast(std::get<1>(_range)), static_cast(ubound))); } return _range; } struct ConvertContext { - const void *srcPtr; - void *dstPtr; + const void* srcPtr; + void* dstPtr; size_t size; ov::element::Type interimPrc; ov::element::Type dstPrc; bool converted; - template + template std::tuple range() const { Range r; r.fit(interimPrc); @@ -298,20 +282,18 @@ struct ConvertContext { } }; -template +template struct 
ConvertPrecision; -template +template struct ConvertPrecision> { - void operator()(ConvertContext & ctx) { - auto src = static_cast(ctx.srcPtr); - auto dst = static_cast(ctx.dstPtr); + void operator()(ConvertContext& ctx) { + auto src = static_cast(ctx.srcPtr); + auto dst = static_cast(ctx.dstPtr); src_t lbound, ubound; std::tie(lbound, ubound) = ctx.range(); - if (std::is_integral::value - || ctx.interimPrc.is_real() - || std::is_integral::value) { + if (std::is_integral::value || ctx.interimPrc.is_real() || std::is_integral::value) { parallel_for(ctx.size, [&](size_t i) { dst[i] = static_cast(std::max(std::min(src[i], ubound), lbound)); }); @@ -325,11 +307,11 @@ struct ConvertPrecision> { } }; -template<> +template <> struct ConvertPrecision> { - void operator()(ConvertContext & ctx) { - auto src = static_cast(ctx.srcPtr); - auto dst = static_cast(ctx.dstPtr); + void operator()(ConvertContext& ctx) { + auto src = static_cast(ctx.srcPtr); + auto dst = static_cast(ctx.dstPtr); if (ctx.interimPrc.is_real()) { parallel_for(ctx.size, [&](size_t i) { @@ -347,11 +329,11 @@ struct ConvertPrecision> { } }; -template<> +template <> struct ConvertPrecision> { - void operator()(ConvertContext & ctx) { - auto src = static_cast(ctx.srcPtr); - auto dst = static_cast(ctx.dstPtr); + void operator()(ConvertContext& ctx) { + auto src = static_cast(ctx.srcPtr); + auto dst = static_cast(ctx.dstPtr); if (ctx.interimPrc.is_real()) { parallel_for(ctx.size, [&](size_t i) { @@ -370,11 +352,11 @@ struct ConvertPrecision> { }; #if defined(OPENVINO_ARCH_X86_64) -template +template struct ConvertPrecision> { - void operator()(ConvertContext & ctx) { - auto src = static_cast(ctx.srcPtr); - auto dst = static_cast(ctx.dstPtr); + void operator()(ConvertContext& ctx) { + auto src = static_cast(ctx.srcPtr); + auto dst = static_cast(ctx.dstPtr); constexpr size_t batch = 64; const size_t iterations = ov::intel_cpu::div_up(ctx.size, batch); @@ -388,16 +370,16 @@ struct ConvertPrecision> { batch_type 
tmp; const size_t offset = i * batch; const size_t current_batch_size = std::min(ctx.size - offset, batch); - for (size_t j = 0; j < current_batch_size; ++j) // src_t -> fp32 + for (size_t j = 0; j < current_batch_size; ++j) // src_t -> fp32 tmp[j] = static_cast(std::max(std::min(src[offset + j], ubound), lbound)); - jit_convert(tmp, dst + offset, current_batch_size); // fp32 -> fp16 + jit_convert(tmp, dst + offset, current_batch_size); // fp32 -> fp16 }); } else if (ctx.interimPrc.is_real()) { parallel_for(iterations, [&](size_t i) { const size_t offset = i * batch; const size_t current_batch_size = std::min(ctx.size - offset, batch); if (std::is_same::type, float>::value) { // fp32 -> fp16 - jit_convert(reinterpret_cast(src) + offset, dst + offset, current_batch_size); + jit_convert(reinterpret_cast(src) + offset, dst + offset, current_batch_size); } else { batch_type tmp; for (size_t j = 0; j < current_batch_size; ++j) // src_t -> fp32 @@ -410,9 +392,9 @@ struct ConvertPrecision> { batch_type tmp; const size_t offset = i * batch; const size_t current_batch_size = std::min(ctx.size - offset, batch); - for (size_t j = 0; j < current_batch_size; ++j) // src_t -> fp32 + for (size_t j = 0; j < current_batch_size; ++j) // src_t -> fp32 tmp[j] = static_cast(std::trunc(std::max(std::min(src[offset + j], ubound), lbound))); - jit_convert(tmp, dst + offset, current_batch_size); // fp32 -> fp16 + jit_convert(tmp, dst + offset, current_batch_size); // fp32 -> fp16 }); } @@ -420,11 +402,11 @@ struct ConvertPrecision> { } }; -template +template struct ConvertPrecision> { - void operator()(ConvertContext & ctx) { - auto src = static_cast(ctx.srcPtr); - auto dst = static_cast(ctx.dstPtr); + void operator()(ConvertContext& ctx) { + auto src = static_cast(ctx.srcPtr); + auto dst = static_cast(ctx.dstPtr); constexpr size_t batch = 64; const size_t iterations = ov::intel_cpu::div_up(ctx.size, batch); @@ -438,8 +420,8 @@ struct ConvertPrecision> { batch_type tmp; const size_t offset 
= i * batch; const size_t current_batch_size = std::min(ctx.size - offset, batch); - jit_convert(src + offset, tmp, current_batch_size); // fp16 -> fp32 - for (size_t j = 0; j < current_batch_size; ++j) // fp32 -> dst_t + jit_convert(src + offset, tmp, current_batch_size); // fp16 -> fp32 + for (size_t j = 0; j < current_batch_size; ++j) // fp32 -> dst_t dst[offset + j] = static_cast(std::max(std::min(tmp[j], ubound), lbound)); }); } else if (ctx.interimPrc.is_real()) { @@ -447,7 +429,7 @@ struct ConvertPrecision> { const size_t offset = i * batch; const size_t current_batch_size = std::min(ctx.size - offset, batch); if (std::is_same::type, float>::value) { // fp16 -> fp32 - jit_convert(src + offset, reinterpret_cast(dst) + offset, current_batch_size); + jit_convert(src + offset, reinterpret_cast(dst) + offset, current_batch_size); } else { batch_type tmp; jit_convert(src + offset, tmp, current_batch_size); // fp16 -> fp32 @@ -460,8 +442,8 @@ struct ConvertPrecision> { batch_type tmp; const size_t offset = i * batch; const size_t current_batch_size = std::min(ctx.size - offset, batch); - jit_convert(src + offset, tmp, current_batch_size); // fp16 -> fp32 - for (size_t j = 0; j < current_batch_size; ++j) // fp32 -> dst_t + jit_convert(src + offset, tmp, current_batch_size); // fp16 -> fp32 + for (size_t j = 0; j < current_batch_size; ++j) // fp32 -> dst_t dst[offset + j] = static_cast(std::trunc(std::max(std::min(tmp[j], ubound), lbound))); }); } @@ -470,11 +452,11 @@ struct ConvertPrecision> { } }; -template<> +template <> struct ConvertPrecision> { - void operator()(ConvertContext & ctx) { - auto src = static_cast(ctx.srcPtr); - auto dst = static_cast(ctx.dstPtr); + void operator()(ConvertContext& ctx) { + auto src = static_cast(ctx.srcPtr); + auto dst = static_cast(ctx.dstPtr); constexpr size_t batch = 64; const size_t iterations = ov::intel_cpu::div_up(ctx.size, batch); @@ -490,10 +472,10 @@ struct ConvertPrecision> { batch_type tmp; const size_t offset = i * 
batch; const size_t current_batch_size = std::min(ctx.size - offset, batch); - jit_convert(src + offset, tmp, current_batch_size); // fp16 -> fp32 - for (size_t j = 0; j < current_batch_size; ++j) // truncate fp32 + jit_convert(src + offset, tmp, current_batch_size); // fp16 -> fp32 + for (size_t j = 0; j < current_batch_size; ++j) // truncate fp32 tmp[j] = std::trunc(std::max(std::min(tmp[j], ubound), lbound)); - jit_convert(tmp, dst + offset, current_batch_size); // fp32 -> fp16 + jit_convert(tmp, dst + offset, current_batch_size); // fp32 -> fp16 }); } @@ -502,7 +484,7 @@ struct ConvertPrecision> { }; #endif -} // namespace +} // namespace #define INTEL_CPU_CVT(ST, DT) \ OV_CASE2(ov::element::ST, \ @@ -510,74 +492,72 @@ struct ConvertPrecision> { PrecisionInfo::value_type, \ PrecisionInfo::value_type) -#define INTEL_CPU_CVT_LIST \ - INTEL_CPU_CVT(u8, i8), INTEL_CPU_CVT(u8, u16), INTEL_CPU_CVT(u8, i16), INTEL_CPU_CVT(u8, u32), \ - INTEL_CPU_CVT(u8, i32), INTEL_CPU_CVT(u8, u64), INTEL_CPU_CVT(u8, i64), INTEL_CPU_CVT(u8, f32), \ - INTEL_CPU_CVT(u8, f16), INTEL_CPU_CVT(u8, bf16), INTEL_CPU_CVT(u8, f64), INTEL_CPU_CVT(u8, boolean), \ - INTEL_CPU_CVT(i8, u8), INTEL_CPU_CVT(i8, u16), INTEL_CPU_CVT(i8, i16), INTEL_CPU_CVT(i8, u32), \ - INTEL_CPU_CVT(i8, i32), INTEL_CPU_CVT(i8, u64), INTEL_CPU_CVT(i8, i64), INTEL_CPU_CVT(i8, f32), \ - INTEL_CPU_CVT(i8, f16), INTEL_CPU_CVT(i8, bf16), INTEL_CPU_CVT(i8, f64), INTEL_CPU_CVT(i8, boolean), \ - INTEL_CPU_CVT(u16, u8), INTEL_CPU_CVT(u16, i8), INTEL_CPU_CVT(u16, i16), INTEL_CPU_CVT(u16, u32), \ - INTEL_CPU_CVT(u16, i32), INTEL_CPU_CVT(u16, u64), INTEL_CPU_CVT(u16, i64), INTEL_CPU_CVT(u16, f32), \ - INTEL_CPU_CVT(u16, f16), INTEL_CPU_CVT(u16, bf16), INTEL_CPU_CVT(u16, f64), INTEL_CPU_CVT(u16, boolean), \ - INTEL_CPU_CVT(i16, u8), INTEL_CPU_CVT(i16, i8), INTEL_CPU_CVT(i16, u16), INTEL_CPU_CVT(i16, u32), \ - INTEL_CPU_CVT(i16, i32), INTEL_CPU_CVT(i16, u64), INTEL_CPU_CVT(i16, i64), INTEL_CPU_CVT(i16, f32), \ - INTEL_CPU_CVT(i16, 
f16), INTEL_CPU_CVT(i16, bf16), INTEL_CPU_CVT(i16, f64), INTEL_CPU_CVT(i16, boolean), \ - INTEL_CPU_CVT(u32, u8), INTEL_CPU_CVT(u32, i8), INTEL_CPU_CVT(u32, u16), INTEL_CPU_CVT(u32, i16), \ - INTEL_CPU_CVT(u32, i32), INTEL_CPU_CVT(u32, u64), INTEL_CPU_CVT(u32, i64), INTEL_CPU_CVT(u32, f32), \ - INTEL_CPU_CVT(u32, f16), INTEL_CPU_CVT(u32, bf16), INTEL_CPU_CVT(u32, f64), INTEL_CPU_CVT(u32, boolean), \ - INTEL_CPU_CVT(i32, u8), INTEL_CPU_CVT(i32, i8), INTEL_CPU_CVT(i32, u16), INTEL_CPU_CVT(i32, i16), \ - INTEL_CPU_CVT(i32, u32), INTEL_CPU_CVT(i32, u64), INTEL_CPU_CVT(i32, i64), INTEL_CPU_CVT(i32, f32), \ - INTEL_CPU_CVT(i32, f16), INTEL_CPU_CVT(i32, bf16), INTEL_CPU_CVT(i32, f64), INTEL_CPU_CVT(i32, boolean), \ - INTEL_CPU_CVT(u64, u8), INTEL_CPU_CVT(u64, i8), INTEL_CPU_CVT(u64, u16), INTEL_CPU_CVT(u64, i16), \ - INTEL_CPU_CVT(u64, u32), INTEL_CPU_CVT(u64, i32), INTEL_CPU_CVT(u64, i64), INTEL_CPU_CVT(u64, f32), \ - INTEL_CPU_CVT(u64, f16), INTEL_CPU_CVT(u64, bf16), INTEL_CPU_CVT(u64, f64), INTEL_CPU_CVT(u64, boolean), \ - INTEL_CPU_CVT(i64, u8), INTEL_CPU_CVT(i64, i8), INTEL_CPU_CVT(i64, u16), INTEL_CPU_CVT(i64, i16), \ - INTEL_CPU_CVT(i64, u32), INTEL_CPU_CVT(i64, i32), INTEL_CPU_CVT(i64, u64), INTEL_CPU_CVT(i64, f32), \ - INTEL_CPU_CVT(i64, f16), INTEL_CPU_CVT(i64, bf16), INTEL_CPU_CVT(i64, f64), INTEL_CPU_CVT(i64, boolean), \ - INTEL_CPU_CVT(f32, u8), INTEL_CPU_CVT(f32, i8), INTEL_CPU_CVT(f32, u16), INTEL_CPU_CVT(f32, i16), \ - INTEL_CPU_CVT(f32, u32), INTEL_CPU_CVT(f32, i32), INTEL_CPU_CVT(f32, u64), INTEL_CPU_CVT(f32, i64), \ - INTEL_CPU_CVT(f32, f16), INTEL_CPU_CVT(f32, bf16), INTEL_CPU_CVT(f32, f64), INTEL_CPU_CVT(f32, boolean), \ - INTEL_CPU_CVT(f16, u8), INTEL_CPU_CVT(f16, i8), INTEL_CPU_CVT(f16, u16), INTEL_CPU_CVT(f16, i16), \ - INTEL_CPU_CVT(f16, u32), INTEL_CPU_CVT(f16, i32), INTEL_CPU_CVT(f16, u64), INTEL_CPU_CVT(f16, i64), \ - INTEL_CPU_CVT(f16, f32), INTEL_CPU_CVT(f16, bf16), INTEL_CPU_CVT(f16, f64), INTEL_CPU_CVT(f16, boolean), \ - INTEL_CPU_CVT(bf16, 
u8), INTEL_CPU_CVT(bf16, i8), INTEL_CPU_CVT(bf16, u16), INTEL_CPU_CVT(bf16, i16), \ - INTEL_CPU_CVT(bf16, u32), INTEL_CPU_CVT(bf16, i32), INTEL_CPU_CVT(bf16, u64), INTEL_CPU_CVT(bf16, i64), \ - INTEL_CPU_CVT(bf16, f32), INTEL_CPU_CVT(bf16, f16), INTEL_CPU_CVT(bf16, f64), INTEL_CPU_CVT(bf16, boolean), \ - INTEL_CPU_CVT(f64, u8), INTEL_CPU_CVT(f64, i8), INTEL_CPU_CVT(f64, u16), INTEL_CPU_CVT(f64, i16), \ - INTEL_CPU_CVT(f64, u32), INTEL_CPU_CVT(f64, i32), INTEL_CPU_CVT(f64, u64), INTEL_CPU_CVT(f64, i64), \ - INTEL_CPU_CVT(f64, f32), INTEL_CPU_CVT(f64, f16), INTEL_CPU_CVT(f64, bf16), INTEL_CPU_CVT(f64, boolean), \ - INTEL_CPU_CVT(boolean, u8), INTEL_CPU_CVT(boolean, i8), INTEL_CPU_CVT(boolean, u16), \ - INTEL_CPU_CVT(boolean, i16), INTEL_CPU_CVT(boolean, u32), INTEL_CPU_CVT(boolean, i32), \ - INTEL_CPU_CVT(boolean, u64), INTEL_CPU_CVT(boolean, i64), INTEL_CPU_CVT(boolean, f32), \ - INTEL_CPU_CVT(boolean, f16), INTEL_CPU_CVT(boolean, bf16), INTEL_CPU_CVT(boolean, f64), INTEL_CPU_CVT(u8, u8), \ - INTEL_CPU_CVT(i8, i8), INTEL_CPU_CVT(u16, u16), INTEL_CPU_CVT(i16, i16), INTEL_CPU_CVT(u32, u32), \ - INTEL_CPU_CVT(i32, i32), INTEL_CPU_CVT(u64, u64), INTEL_CPU_CVT(i64, i64), INTEL_CPU_CVT(f32, f32), \ - INTEL_CPU_CVT(f16, f16), INTEL_CPU_CVT(bf16, bf16), INTEL_CPU_CVT(f64, f64), INTEL_CPU_CVT(boolean, boolean) - - -#define INTEL_CPU_CVT_FROM_BIN_LIST \ - INTEL_CPU_CVT(u1, f32), INTEL_CPU_CVT(u1, f16), INTEL_CPU_CVT(u1, bf16), \ - INTEL_CPU_CVT(u1, f64), INTEL_CPU_CVT(u1, i16), INTEL_CPU_CVT(u1, u8), \ - INTEL_CPU_CVT(u1, i8), INTEL_CPU_CVT(u1, u16), INTEL_CPU_CVT(u1, i32), \ - INTEL_CPU_CVT(u1, u32), INTEL_CPU_CVT(u1, i64), INTEL_CPU_CVT(u1, u64), \ - INTEL_CPU_CVT(u1, boolean) +#define INTEL_CPU_CVT_LIST \ + INTEL_CPU_CVT(u8, i8), INTEL_CPU_CVT(u8, u16), INTEL_CPU_CVT(u8, i16), INTEL_CPU_CVT(u8, u32), \ + INTEL_CPU_CVT(u8, i32), INTEL_CPU_CVT(u8, u64), INTEL_CPU_CVT(u8, i64), INTEL_CPU_CVT(u8, f32), \ + INTEL_CPU_CVT(u8, f16), INTEL_CPU_CVT(u8, bf16), INTEL_CPU_CVT(u8, 
f64), INTEL_CPU_CVT(u8, boolean), \ + INTEL_CPU_CVT(i8, u8), INTEL_CPU_CVT(i8, u16), INTEL_CPU_CVT(i8, i16), INTEL_CPU_CVT(i8, u32), \ + INTEL_CPU_CVT(i8, i32), INTEL_CPU_CVT(i8, u64), INTEL_CPU_CVT(i8, i64), INTEL_CPU_CVT(i8, f32), \ + INTEL_CPU_CVT(i8, f16), INTEL_CPU_CVT(i8, bf16), INTEL_CPU_CVT(i8, f64), INTEL_CPU_CVT(i8, boolean), \ + INTEL_CPU_CVT(u16, u8), INTEL_CPU_CVT(u16, i8), INTEL_CPU_CVT(u16, i16), INTEL_CPU_CVT(u16, u32), \ + INTEL_CPU_CVT(u16, i32), INTEL_CPU_CVT(u16, u64), INTEL_CPU_CVT(u16, i64), INTEL_CPU_CVT(u16, f32), \ + INTEL_CPU_CVT(u16, f16), INTEL_CPU_CVT(u16, bf16), INTEL_CPU_CVT(u16, f64), INTEL_CPU_CVT(u16, boolean), \ + INTEL_CPU_CVT(i16, u8), INTEL_CPU_CVT(i16, i8), INTEL_CPU_CVT(i16, u16), INTEL_CPU_CVT(i16, u32), \ + INTEL_CPU_CVT(i16, i32), INTEL_CPU_CVT(i16, u64), INTEL_CPU_CVT(i16, i64), INTEL_CPU_CVT(i16, f32), \ + INTEL_CPU_CVT(i16, f16), INTEL_CPU_CVT(i16, bf16), INTEL_CPU_CVT(i16, f64), INTEL_CPU_CVT(i16, boolean), \ + INTEL_CPU_CVT(u32, u8), INTEL_CPU_CVT(u32, i8), INTEL_CPU_CVT(u32, u16), INTEL_CPU_CVT(u32, i16), \ + INTEL_CPU_CVT(u32, i32), INTEL_CPU_CVT(u32, u64), INTEL_CPU_CVT(u32, i64), INTEL_CPU_CVT(u32, f32), \ + INTEL_CPU_CVT(u32, f16), INTEL_CPU_CVT(u32, bf16), INTEL_CPU_CVT(u32, f64), INTEL_CPU_CVT(u32, boolean), \ + INTEL_CPU_CVT(i32, u8), INTEL_CPU_CVT(i32, i8), INTEL_CPU_CVT(i32, u16), INTEL_CPU_CVT(i32, i16), \ + INTEL_CPU_CVT(i32, u32), INTEL_CPU_CVT(i32, u64), INTEL_CPU_CVT(i32, i64), INTEL_CPU_CVT(i32, f32), \ + INTEL_CPU_CVT(i32, f16), INTEL_CPU_CVT(i32, bf16), INTEL_CPU_CVT(i32, f64), INTEL_CPU_CVT(i32, boolean), \ + INTEL_CPU_CVT(u64, u8), INTEL_CPU_CVT(u64, i8), INTEL_CPU_CVT(u64, u16), INTEL_CPU_CVT(u64, i16), \ + INTEL_CPU_CVT(u64, u32), INTEL_CPU_CVT(u64, i32), INTEL_CPU_CVT(u64, i64), INTEL_CPU_CVT(u64, f32), \ + INTEL_CPU_CVT(u64, f16), INTEL_CPU_CVT(u64, bf16), INTEL_CPU_CVT(u64, f64), INTEL_CPU_CVT(u64, boolean), \ + INTEL_CPU_CVT(i64, u8), INTEL_CPU_CVT(i64, i8), INTEL_CPU_CVT(i64, u16), 
INTEL_CPU_CVT(i64, i16), \ + INTEL_CPU_CVT(i64, u32), INTEL_CPU_CVT(i64, i32), INTEL_CPU_CVT(i64, u64), INTEL_CPU_CVT(i64, f32), \ + INTEL_CPU_CVT(i64, f16), INTEL_CPU_CVT(i64, bf16), INTEL_CPU_CVT(i64, f64), INTEL_CPU_CVT(i64, boolean), \ + INTEL_CPU_CVT(f32, u8), INTEL_CPU_CVT(f32, i8), INTEL_CPU_CVT(f32, u16), INTEL_CPU_CVT(f32, i16), \ + INTEL_CPU_CVT(f32, u32), INTEL_CPU_CVT(f32, i32), INTEL_CPU_CVT(f32, u64), INTEL_CPU_CVT(f32, i64), \ + INTEL_CPU_CVT(f32, f16), INTEL_CPU_CVT(f32, bf16), INTEL_CPU_CVT(f32, f64), INTEL_CPU_CVT(f32, boolean), \ + INTEL_CPU_CVT(f16, u8), INTEL_CPU_CVT(f16, i8), INTEL_CPU_CVT(f16, u16), INTEL_CPU_CVT(f16, i16), \ + INTEL_CPU_CVT(f16, u32), INTEL_CPU_CVT(f16, i32), INTEL_CPU_CVT(f16, u64), INTEL_CPU_CVT(f16, i64), \ + INTEL_CPU_CVT(f16, f32), INTEL_CPU_CVT(f16, bf16), INTEL_CPU_CVT(f16, f64), INTEL_CPU_CVT(f16, boolean), \ + INTEL_CPU_CVT(bf16, u8), INTEL_CPU_CVT(bf16, i8), INTEL_CPU_CVT(bf16, u16), INTEL_CPU_CVT(bf16, i16), \ + INTEL_CPU_CVT(bf16, u32), INTEL_CPU_CVT(bf16, i32), INTEL_CPU_CVT(bf16, u64), INTEL_CPU_CVT(bf16, i64), \ + INTEL_CPU_CVT(bf16, f32), INTEL_CPU_CVT(bf16, f16), INTEL_CPU_CVT(bf16, f64), INTEL_CPU_CVT(bf16, boolean), \ + INTEL_CPU_CVT(f64, u8), INTEL_CPU_CVT(f64, i8), INTEL_CPU_CVT(f64, u16), INTEL_CPU_CVT(f64, i16), \ + INTEL_CPU_CVT(f64, u32), INTEL_CPU_CVT(f64, i32), INTEL_CPU_CVT(f64, u64), INTEL_CPU_CVT(f64, i64), \ + INTEL_CPU_CVT(f64, f32), INTEL_CPU_CVT(f64, f16), INTEL_CPU_CVT(f64, bf16), INTEL_CPU_CVT(f64, boolean), \ + INTEL_CPU_CVT(boolean, u8), INTEL_CPU_CVT(boolean, i8), INTEL_CPU_CVT(boolean, u16), \ + INTEL_CPU_CVT(boolean, i16), INTEL_CPU_CVT(boolean, u32), INTEL_CPU_CVT(boolean, i32), \ + INTEL_CPU_CVT(boolean, u64), INTEL_CPU_CVT(boolean, i64), INTEL_CPU_CVT(boolean, f32), \ + INTEL_CPU_CVT(boolean, f16), INTEL_CPU_CVT(boolean, bf16), INTEL_CPU_CVT(boolean, f64), INTEL_CPU_CVT(u8, u8), \ + INTEL_CPU_CVT(i8, i8), INTEL_CPU_CVT(u16, u16), INTEL_CPU_CVT(i16, i16), INTEL_CPU_CVT(u32, u32), \ 
+ INTEL_CPU_CVT(i32, i32), INTEL_CPU_CVT(u64, u64), INTEL_CPU_CVT(i64, i64), INTEL_CPU_CVT(f32, f32), \ + INTEL_CPU_CVT(f16, f16), INTEL_CPU_CVT(bf16, bf16), INTEL_CPU_CVT(f64, f64), INTEL_CPU_CVT(boolean, boolean) + +#define INTEL_CPU_CVT_FROM_BIN_LIST \ + INTEL_CPU_CVT(u1, f32), INTEL_CPU_CVT(u1, f16), INTEL_CPU_CVT(u1, bf16), INTEL_CPU_CVT(u1, f64), \ + INTEL_CPU_CVT(u1, i16), INTEL_CPU_CVT(u1, u8), INTEL_CPU_CVT(u1, i8), INTEL_CPU_CVT(u1, u16), \ + INTEL_CPU_CVT(u1, i32), INTEL_CPU_CVT(u1, u32), INTEL_CPU_CVT(u1, i64), INTEL_CPU_CVT(u1, u64), \ + INTEL_CPU_CVT(u1, boolean) struct ConvertFromBinContext { - const void *srcPtr; - void *dstPtr; + const void* srcPtr; + void* dstPtr; size_t size; bool converted; }; -template +template struct ConvertFromBinPrecision; -template +template struct ConvertFromBinPrecision> { - void operator()(ConvertFromBinContext &ctx) { - auto src = static_cast(ctx.srcPtr); - auto dst = static_cast(ctx.dstPtr); + void operator()(ConvertFromBinContext& ctx) { + auto src = static_cast(ctx.srcPtr); + auto dst = static_cast(ctx.dstPtr); const size_t nBits = 8; const size_t nBytes = rnd_up(ctx.size, nBits); parallel_for(nBytes, [&](size_t byteIndex) { @@ -590,16 +570,17 @@ struct ConvertFromBinPrecision> { } }; -#define INTEL_CPU_CVT_FROM_4BIT_LIST \ - INTEL_CPU_CVT(u4, f32), INTEL_CPU_CVT(u4, bf16), INTEL_CPU_CVT(u4, f16), INTEL_CPU_CVT(u4, i8), INTEL_CPU_CVT(u4, u8), \ - INTEL_CPU_CVT(i4, f32), INTEL_CPU_CVT(i4, bf16), INTEL_CPU_CVT(i4, f16), INTEL_CPU_CVT(i4, i8), INTEL_CPU_CVT(i4, u8), \ - INTEL_CPU_CVT(nf4, f32), INTEL_CPU_CVT(nf4, bf16), INTEL_CPU_CVT(nf4, f16), INTEL_CPU_CVT(nf4, i8), INTEL_CPU_CVT(nf4, u8), \ - INTEL_CPU_CVT(f4e2m1, f32), INTEL_CPU_CVT(f4e2m1, bf16), INTEL_CPU_CVT(f4e2m1, f16), INTEL_CPU_CVT(f4e2m1, i8), INTEL_CPU_CVT(f4e2m1, u8) +#define INTEL_CPU_CVT_FROM_4BIT_LIST \ + INTEL_CPU_CVT(u4, f32), INTEL_CPU_CVT(u4, bf16), INTEL_CPU_CVT(u4, f16), INTEL_CPU_CVT(u4, i8), \ + INTEL_CPU_CVT(u4, u8), INTEL_CPU_CVT(i4, f32), 
INTEL_CPU_CVT(i4, bf16), INTEL_CPU_CVT(i4, f16), \ + INTEL_CPU_CVT(i4, i8), INTEL_CPU_CVT(i4, u8), INTEL_CPU_CVT(nf4, f32), INTEL_CPU_CVT(nf4, bf16), \ + INTEL_CPU_CVT(nf4, f16), INTEL_CPU_CVT(nf4, i8), INTEL_CPU_CVT(nf4, u8), INTEL_CPU_CVT(f4e2m1, f32), \ + INTEL_CPU_CVT(f4e2m1, bf16), INTEL_CPU_CVT(f4e2m1, f16), INTEL_CPU_CVT(f4e2m1, i8), INTEL_CPU_CVT(f4e2m1, u8) struct ConvertFrom4BitContext { ov::element::Type_t inType; - const void *srcPtr; - void *dstPtr; + const void* srcPtr; + void* dstPtr; size_t size; bool converted; }; @@ -624,12 +605,12 @@ static int8_t get_u4(const uint8_t& val, bool high) { return high ? (val >> 4) : (val & 0xF); } -template +template struct ConvertFrom4BitPrecision; -template +template struct ConvertFrom4BitPrecision> { - void operator()(ConvertFrom4BitContext &ctx) { + void operator()(ConvertFrom4BitContext& ctx) { auto src = static_cast(ctx.srcPtr); auto dst = static_cast(ctx.dstPtr); if (ctx.inType == ov::element::nf4) { @@ -655,23 +636,23 @@ struct ConvertFrom4BitPrecision> { } }; -#define INTEL_CPU_CVT_FROM_BYTE_FP_LIST \ +#define INTEL_CPU_CVT_FROM_BYTE_FP_LIST \ INTEL_CPU_CVT(f8e8m0, f32), INTEL_CPU_CVT(f8e8m0, bf16), INTEL_CPU_CVT(f8e8m0, f16) struct ConvertFromByteFPContext { ov::element::Type_t inType; - const void *srcPtr; - void *dstPtr; + const void* srcPtr; + void* dstPtr; size_t size; bool converted; }; -template +template struct ConvertFromByteFPPrecision; -template +template struct ConvertFromByteFPPrecision> { - void operator()(ConvertFromByteFPContext &ctx) { + void operator()(ConvertFromByteFPContext& ctx) { auto src = static_cast(ctx.srcPtr); auto dst = static_cast(ctx.dstPtr); if (ctx.inType == ov::element::f8e8m0) { @@ -685,12 +666,16 @@ struct ConvertFromByteFPPrecision> { } }; -void cpu_convert(const void *srcPtr, void *dstPtr, ov::element::Type srcPrc, ov::element::Type dstPrc, const size_t size) { +void cpu_convert(const void* srcPtr, + void* dstPtr, + ov::element::Type srcPrc, + ov::element::Type dstPrc, 
+ const size_t size) { cpu_convert(srcPtr, dstPtr, srcPrc, dstPrc, dstPrc, size); } -void cpu_convert(const void *srcPtr, - void *dstPtr, +void cpu_convert(const void* srcPtr, + void* dstPtr, ov::element::Type srcPrc, ov::element::Type interimPrc, ov::element::Type dstPrc, @@ -705,12 +690,12 @@ void cpu_convert(const void *srcPtr, const size_t L2_cache_size = dnnl::utils::get_cache_size(2, true); const size_t totalSize = size * dstPrc.size(); if (srcPrc == element::string) { - auto str_src = reinterpret_cast(srcPtr); - auto str_dst = reinterpret_cast(dstPtr); + auto str_src = reinterpret_cast(srcPtr); + auto str_dst = reinterpret_cast(dstPtr); std::copy(str_src, str_src + size, str_dst); } else if (totalSize >= L2_cache_size) { - auto src = static_cast(srcPtr); - auto dst = static_cast(dstPtr); + auto src = static_cast(srcPtr); + auto dst = static_cast(dstPtr); parallel_nt(0, [&](const size_t ithr, const size_t nthr) { size_t start = 0, end = 0; splitter(totalSize, nthr, ithr, start, end); @@ -728,12 +713,7 @@ void cpu_convert(const void *srcPtr, "> precision to: ", dstPrc, ". 
Not implemented."); - ConvertFromBinContext ctx { - srcPtr, - dstPtr, - size, - false - }; + ConvertFromBinContext ctx{srcPtr, dstPtr, size, false}; OV_SWITCH(intel_cpu, ConvertFromBinPrecision, ctx, std::tie(srcPrc, dstPrc), INTEL_CPU_CVT_FROM_BIN_LIST); if (!ctx.converted) OPENVINO_THROW("cpu_convert can't convert from: ", @@ -749,18 +729,15 @@ void cpu_convert(const void *srcPtr, OPENVINO_THROW("cpu_convert can't convert from: ", srcPrc, " precision to: ", dstPrc); } else if (srcPrc.bitwidth() == 8u && srcPrc.is_real()) { ConvertFromByteFPContext ctx{srcPrc, srcPtr, dstPtr, size, false}; - OV_SWITCH(intel_cpu, ConvertFromByteFPPrecision, ctx, std::tie(srcPrc, dstPrc), INTEL_CPU_CVT_FROM_BYTE_FP_LIST); + OV_SWITCH(intel_cpu, + ConvertFromByteFPPrecision, + ctx, + std::tie(srcPrc, dstPrc), + INTEL_CPU_CVT_FROM_BYTE_FP_LIST); if (!ctx.converted) OPENVINO_THROW("cpu_convert can't convert from: ", srcPrc, " precision to: ", dstPrc); } else { - ConvertContext ctx { - srcPtr, - dstPtr, - size, - interimPrc, - dstPrc, - false - }; + ConvertContext ctx{srcPtr, dstPtr, size, interimPrc, dstPrc, false}; OV_SWITCH(intel_cpu, ConvertPrecision, ctx, std::tie(srcPrc, dstPrc), INTEL_CPU_CVT_LIST); if (!ctx.converted) OPENVINO_THROW("cpu_convert can't convert from: ", srcPrc, " precision to: ", dstPrc); @@ -773,7 +750,7 @@ struct isSupportedContext { template struct isSupported { - void operator()(isSupportedContext &ctx) { + void operator()(isSupportedContext& ctx) { ctx.isSupported = true; } }; @@ -790,5 +767,5 @@ bool is_supported_convert(ov::element::Type srcPrc, ov::element::Type dstPrc) { #undef INTEL_CPU_CVT #undef INTEL_CPU_CVT_LIST -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/common/cpu_convert.h b/src/plugins/intel_cpu/src/nodes/common/cpu_convert.h index 8390849ff8adc7..11228dbd1dcfdb 100644 --- a/src/plugins/intel_cpu/src/nodes/common/cpu_convert.h +++ 
b/src/plugins/intel_cpu/src/nodes/common/cpu_convert.h @@ -22,8 +22,8 @@ namespace intel_cpu { * number of elements in buffers to be converted * @return none. */ -void cpu_convert(const void *srcPtr, - void *dstPtr, +void cpu_convert(const void* srcPtr, + void* dstPtr, ov::element::Type srcPrc, ov::element::Type dstPrc, const size_t size); @@ -45,14 +45,14 @@ void cpu_convert(const void *srcPtr, * number of elements in buffers to be converted * @return none. */ -void cpu_convert(const void *srcPtr, - void *dstPtr, +void cpu_convert(const void* srcPtr, + void* dstPtr, ov::element::Type srcPrc, ov::element::Type interimPrc, ov::element::Type dstPrc, const size_t size); - bool is_supported_convert(ov::element::Type srcPrc, ov::element::Type dstPrc); +bool is_supported_convert(ov::element::Type srcPrc, ov::element::Type dstPrc); -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/common/cpu_memcpy.h b/src/plugins/intel_cpu/src/nodes/common/cpu_memcpy.h old mode 100755 new mode 100644 index 95b0267bd4757c..e827d35a11c2ad --- a/src/plugins/intel_cpu/src/nodes/common/cpu_memcpy.h +++ b/src/plugins/intel_cpu/src/nodes/common/cpu_memcpy.h @@ -5,8 +5,9 @@ #pragma once #include -#include "openvino/core/parallel.hpp" + #include "onednn/dnnl.h" +#include "openvino/core/parallel.hpp" namespace ov { namespace intel_cpu { @@ -36,8 +37,7 @@ inline void cpu_memcpy(void* dst, const void* src, size_t count) { } inline int cpu_memcpy_s(void* dst, size_t dst_size, const void* src, size_t count) { - if (!src || - count > dst_size || + if (!src || count > dst_size || count > (dst > src ? 
((uintptr_t)dst - (uintptr_t)src) : ((uintptr_t)src - (uintptr_t)dst))) { // zero out dest if error detected std::memset(dst, 0, dst_size); @@ -55,8 +55,8 @@ inline int cpu_memcpy_s(void* dst, size_t dst_size, const void* src, size_t coun inline void cpu_parallel_memcpy(void* dst, const void* src, size_t count) { const size_t l2_cache_size = dnnl::utils::get_cache_size(2, true); if (count >= l2_cache_size) { - auto src_int8 = static_cast(src); - auto dst_int8 = static_cast(dst); + auto src_int8 = static_cast(src); + auto dst_int8 = static_cast(dst); parallel_nt(0, [&](const size_t ithr, const size_t nthr) { size_t start = 0, end = 0; splitter(count, nthr, ithr, start, end); @@ -67,5 +67,5 @@ inline void cpu_parallel_memcpy(void* dst, const void* src, size_t count) { } } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/common/defs.h b/src/plugins/intel_cpu/src/nodes/common/defs.h index 6d8574de0939a4..a8a07a2cc8942a 100644 --- a/src/plugins/intel_cpu/src/nodes/common/defs.h +++ b/src/plugins/intel_cpu/src/nodes/common/defs.h @@ -4,10 +4,10 @@ #pragma once -#if defined (HAVE_SSE) || defined (HAVE_AVX2) -# if defined (_WIN32) -# include -# else -# include -# endif +#if defined(HAVE_SSE) || defined(HAVE_AVX2) +# if defined(_WIN32) +# include +# else +# include +# endif #endif diff --git a/src/plugins/intel_cpu/src/nodes/common/dnnl_executor.cpp b/src/plugins/intel_cpu/src/nodes/common/dnnl_executor.cpp index 51aa54c2f50463..695fdbe823ea15 100644 --- a/src/plugins/intel_cpu/src/nodes/common/dnnl_executor.cpp +++ b/src/plugins/intel_cpu/src/nodes/common/dnnl_executor.cpp @@ -18,7 +18,9 @@ DnnlExecutor::DnnlExecutor(const dnnl::primitive_desc& pd) { DnnlExecutor::IntermReorder::IntermReorder(const dnnl::memory::desc& descSrc, const dnnl::memory::desc& descDst, - const dnnl::engine& engine) : m_descSrc(descSrc), m_descDst(descDst) { + const dnnl::engine& engine) + : m_descSrc(descSrc), + 
m_descDst(descDst) { auto reorderPd = dnnl::reorder::primitive_desc(engine, descSrc, engine, descDst); m_reorder = dnnl::reorder(reorderPd); } @@ -36,7 +38,7 @@ void DnnlExecutor::exec(const std::unordered_map& primArgs, d } void DnnlExecutor::reorder_exec(std::unordered_map primArgs, dnnl::stream strm) { - for (auto &inReorder : inputReorders) { + for (auto& inReorder : inputReorders) { if (primArgs.count(inReorder.first)) { dnnl::memory memDst(inReorder.second.getDstDesc(), strm.get_engine()); inReorder.second.exec(primArgs[inReorder.first], memDst, strm); @@ -46,17 +48,19 @@ void DnnlExecutor::reorder_exec(std::unordered_map primArgs, } } std::unordered_map outputMem; - for (auto &outReorder : outputReorders) { + for (auto& outReorder : outputReorders) { if (primArgs.count(outReorder.first)) { dnnl::memory memSrc(outReorder.second.getSrcDesc(), strm.get_engine()); outputMem[outReorder.first] = primArgs[outReorder.first]; primArgs[outReorder.first] = memSrc; } else { - OPENVINO_THROW("DnnlExecutor has reorder for output ", outReorder.first, ", but doesn't have destination memory"); + OPENVINO_THROW("DnnlExecutor has reorder for output ", + outReorder.first, + ", but doesn't have destination memory"); } } execPrim.execute(strm, primArgs); - for (auto &outReorder : outputReorders) { + for (auto& outReorder : outputReorders) { outReorder.second.exec(primArgs[outReorder.first], outputMem[outReorder.first], strm); } } @@ -79,4 +83,4 @@ impl_desc_type DnnlExecutor::getImplementationType() const { } } // namespace intel_cpu -} // namespace ov +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/common/dnnl_executor.h b/src/plugins/intel_cpu/src/nodes/common/dnnl_executor.h index 3cc6749857816c..32739a38d37028 100644 --- a/src/plugins/intel_cpu/src/nodes/common/dnnl_executor.h +++ b/src/plugins/intel_cpu/src/nodes/common/dnnl_executor.h @@ -6,74 +6,79 @@ #include #include + #include "memory_desc/dnnl_memory_desc.h" namespace ov { namespace intel_cpu { class 
DnnlExecutor { - protected: - class IntermReorder { - public: - IntermReorder(const dnnl::memory::desc& descSrc, const dnnl::memory::desc& descDst, const dnnl::engine& engine); - void exec(dnnl::memory& memSrc, dnnl::memory& memDst, dnnl::stream strm); - const dnnl::memory::desc& getSrcDesc() const { return m_descSrc; } - const dnnl::memory::desc& getDstDesc() const { return m_descDst; } - - private: - dnnl::reorder m_reorder; - dnnl::memory::desc m_descSrc; - dnnl::memory::desc m_descDst; - }; - +protected: + class IntermReorder { public: - explicit DnnlExecutor(const dnnl::primitive_desc& pd); - void exec(const std::unordered_map& primArgs, dnnl::stream strm); - bool needReordering() const; - virtual ~DnnlExecutor() = default; - dnnl::primitive getExecPrim() const; - const_dnnl_primitive_desc_t getPrimitiveDesc() const; - impl_desc_type getImplementationType() const; - - DnnlMemoryDescPtr getSrcDesc() const { - return src_md; + IntermReorder(const dnnl::memory::desc& descSrc, const dnnl::memory::desc& descDst, const dnnl::engine& engine); + void exec(dnnl::memory& memSrc, dnnl::memory& memDst, dnnl::stream strm); + const dnnl::memory::desc& getSrcDesc() const { + return m_descSrc; } - DnnlMemoryDescPtr getWeightDesc() const { - return wghts_md; - } - DnnlMemoryDescPtr getDstDesc() const { - return dst_md; - } - DnnlMemoryDescPtr getScratchPadDesc() const { - return scrch_md; + const dnnl::memory::desc& getDstDesc() const { + return m_descDst; } - const dnnl::memory::desc& getDnnlSrcDesc() const { - return src_md->getDnnlDesc(); - } - const dnnl::memory::desc& getDnnlWeightDesc() const { - return wghts_md->getDnnlDesc(); - } - const dnnl::memory::desc& getDnnlDstDesc() const { - return dst_md->getDnnlDesc(); - } - const dnnl::memory::desc& getDnnlScratchPadDesc() const { - return scrch_md->getDnnlDesc(); - } + private: + dnnl::reorder m_reorder; + dnnl::memory::desc m_descSrc; + dnnl::memory::desc m_descDst; + }; + +public: + explicit DnnlExecutor(const 
dnnl::primitive_desc& pd); + void exec(const std::unordered_map& primArgs, dnnl::stream strm); + bool needReordering() const; + virtual ~DnnlExecutor() = default; + dnnl::primitive getExecPrim() const; + const_dnnl_primitive_desc_t getPrimitiveDesc() const; + impl_desc_type getImplementationType() const; + + DnnlMemoryDescPtr getSrcDesc() const { + return src_md; + } + DnnlMemoryDescPtr getWeightDesc() const { + return wghts_md; + } + DnnlMemoryDescPtr getDstDesc() const { + return dst_md; + } + DnnlMemoryDescPtr getScratchPadDesc() const { + return scrch_md; + } + + const dnnl::memory::desc& getDnnlSrcDesc() const { + return src_md->getDnnlDesc(); + } + const dnnl::memory::desc& getDnnlWeightDesc() const { + return wghts_md->getDnnlDesc(); + } + const dnnl::memory::desc& getDnnlDstDesc() const { + return dst_md->getDnnlDesc(); + } + const dnnl::memory::desc& getDnnlScratchPadDesc() const { + return scrch_md->getDnnlDesc(); + } - protected: - virtual void reorder_exec(std::unordered_map primArgs, dnnl::stream strm); +protected: + virtual void reorder_exec(std::unordered_map primArgs, dnnl::stream strm); - protected: - dnnl::primitive execPrim; - // key is the port number for the primitive that needs memory reordering - std::unordered_map inputReorders; - std::unordered_map outputReorders; - DnnlMemoryDescPtr src_md; - DnnlMemoryDescPtr wghts_md; - DnnlMemoryDescPtr dst_md; - DnnlMemoryDescPtr scrch_md; +protected: + dnnl::primitive execPrim; + // key is the port number for the primitive that needs memory reordering + std::unordered_map inputReorders; + std::unordered_map outputReorders; + DnnlMemoryDescPtr src_md; + DnnlMemoryDescPtr wghts_md; + DnnlMemoryDescPtr dst_md; + DnnlMemoryDescPtr scrch_md; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/common/fp16_utils.h b/src/plugins/intel_cpu/src/nodes/common/fp16_utils.h index daedcc4bf23ca4..b6622f7ae54d0b 100644 --- 
a/src/plugins/intel_cpu/src/nodes/common/fp16_utils.h +++ b/src/plugins/intel_cpu/src/nodes/common/fp16_utils.h @@ -13,7 +13,7 @@ typedef short ie_fp16; // F32: exp_bias:127 SEEEEEEE EMMMMMMM MMMMMMMM MMMMMMMM. // F16: exp_bias:15 SEEEEEMM MMMMMMMM #define EXP_MASK_F32 0x7F800000U -#define EXP_MASK_F16 0x7C00U +#define EXP_MASK_F16 0x7C00U // small helper function to represent uint32_t value as float32 inline float asfloat(uint32_t v) { @@ -83,5 +83,5 @@ inline float f16tof32(ie_fp16 x) { return asfloat(u); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/common/permute_kernel.cpp b/src/plugins/intel_cpu/src/nodes/common/permute_kernel.cpp index 396cebc1ba82e1..60bd675d726e4a 100644 --- a/src/plugins/intel_cpu/src/nodes/common/permute_kernel.cpp +++ b/src/plugins/intel_cpu/src/nodes/common/permute_kernel.cpp @@ -6,15 +6,14 @@ #include -#include "dnnl_types.h" -#include "dnnl_extension_utils.h" -#include "cpu_memcpy.h" -#include "utils/bfloat16.hpp" - -#include "cpu/x64/jit_generator.hpp" #include "common/primitive_hashing_utils.hpp" -#include "nodes/executors/transpose.hpp" +#include "cpu/x64/jit_generator.hpp" +#include "cpu_memcpy.h" +#include "dnnl_extension_utils.h" +#include "dnnl_types.h" #include "nodes/executors/common/ref_transpose.hpp" +#include "nodes/executors/transpose.hpp" +#include "utils/bfloat16.hpp" using namespace dnnl; using namespace dnnl::impl; @@ -33,7 +32,9 @@ template struct jit_uni_permute_kernel_f32 : public jit_uni_permute_kernel, public jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_permute_kernel_f32) - explicit jit_uni_permute_kernel_f32(jit_permute_config_params jcp_) : jit_uni_permute_kernel(jcp_), jit_generator(jit_name()) {} + explicit jit_uni_permute_kernel_f32(jit_permute_config_params jcp_) + : jit_uni_permute_kernel(jcp_), + jit_generator(jit_name()) {} void create_ker() override { jit_generator::create_kernel(); @@ -51,23 +52,43 @@ 
struct jit_uni_permute_kernel_f32 : public jit_uni_permute_kernel, public jit_ge this->postamble(); } - void load(const Xbyak::Xmm &xmm, const Xbyak::Address &addr) { + void load(const Xbyak::Xmm& xmm, const Xbyak::Address& addr) { switch (jcp.data_size) { - case 16: uni_vmovups(xmm, addr); break; - case 8: uni_vmovsd(xmm, addr); break; - case 4: uni_vmovss(xmm, addr); break; - case 2: uni_vpinsrw(xmm, xmm, addr, 0x0); break; - case 1: uni_vpinsrb(xmm, xmm, addr, 0x0); break; + case 16: + uni_vmovups(xmm, addr); + break; + case 8: + uni_vmovsd(xmm, addr); + break; + case 4: + uni_vmovss(xmm, addr); + break; + case 2: + uni_vpinsrw(xmm, xmm, addr, 0x0); + break; + case 1: + uni_vpinsrb(xmm, xmm, addr, 0x0); + break; } } - void store(const Xbyak::Address &addr, const Xbyak::Xmm &xmm) { + void store(const Xbyak::Address& addr, const Xbyak::Xmm& xmm) { switch (jcp.data_size) { - case 16: uni_vmovups(addr, xmm); break; - case 8: uni_vmovsd(addr, xmm); break; - case 4: uni_vmovss(addr, xmm); break; - case 2: uni_vpextrw(addr, xmm, 0x0); break; - case 1: uni_vpextrb(addr, xmm, 0x0); break; + case 16: + uni_vmovups(addr, xmm); + break; + case 8: + uni_vmovsd(addr, xmm); + break; + case 4: + uni_vmovss(addr, xmm); + break; + case 2: + uni_vpextrw(addr, xmm, 0x0); + break; + case 1: + uni_vpextrb(addr, xmm, 0x0); + break; } } @@ -99,7 +120,8 @@ struct jit_uni_permute_kernel_f32 : public jit_uni_permute_kernel, public jit_ge } } - L(tail_loop_label); { + L(tail_loop_label); + { cmp(reg_work_amount, 0); je(exit_label, T_NEAR); @@ -129,7 +151,8 @@ struct jit_uni_permute_kernel_f32 : public jit_uni_permute_kernel, public jit_ge } private: - using Vmm = typename conditional3::type; + using Vmm = + typename conditional3::type; uint32_t vlen = cpu_isa_traits::vlen; Xbyak::Reg64 reg_src = r8; @@ -144,7 +167,7 @@ struct jit_uni_permute_kernel_f32 : public jit_uni_permute_kernel, public jit_ge Xbyak::Xmm xmm = Xbyak::Xmm(1); }; -#endif // OPENVINO_ARCH_X86_64 +#endif // 
OPENVINO_ARCH_X86_64 PermuteKernel::PermuteKernel(const PermuteParams& params) : params(params) { jcp = TransposeExecutor::prepareParams(params); @@ -156,7 +179,7 @@ PermuteKernel::PermuteKernel(const PermuteParams& params) : params(params) { } else if (mayiuse(cpu::x64::sse41)) { permute_kernel.reset(new jit_uni_permute_kernel_f32(jcp)); } -#endif // OPENVINO_ARCH_X86_64 +#endif // OPENVINO_ARCH_X86_64 if (permute_kernel) permute_kernel->create_ker(); @@ -178,7 +201,7 @@ void PermuteKernel::execute(const uint8_t* src_data, uint8_t* dst_data) { return; } - RefTransposeExecutor::referenceExecute(src_data, dst_data, jcp, dst_dims[0]); + RefTransposeExecutor::referenceExecute(src_data, dst_data, jcp, dst_dims[0]); } void PermuteKernel::optimizedExecute(const uint8_t* src_data, uint8_t* dst_data, const int mb) { @@ -190,42 +213,42 @@ void PermuteKernel::optimizedExecute(const uint8_t* src_data, uint8_t* dst_data, dst_dims[0] = mb; switch (jcp.n) { - case 1: - parallel_for(dst_dims[0], [&](int i0) { - auto arg = jit_args_permute(); - - size_t dst_off = i0 * dst_strides[0]; - size_t src_off = i0 * src_strides[0]; - arg.src = &src_data[src_off * jcp.data_size]; - arg.dst = &dst_data[dst_off * jcp.data_size]; - - (*permute_kernel)(&arg); - }); - break; - case 2: - parallel_for2d(dst_dims[0], dst_dims[1], [&](int i0, int i1) { - auto arg = jit_args_permute(); - - size_t dst_off = i0 * dst_strides[0] + i1 * dst_strides[1]; - size_t src_off = i0 * src_strides[0] + i1 * src_strides[1]; - arg.src = &src_data[src_off * jcp.data_size]; - arg.dst = &dst_data[dst_off * jcp.data_size]; - - (*permute_kernel)(&arg); - }); - break; - case 3: - parallel_for3d(dst_dims[0], dst_dims[1], dst_dims[2], [&](int i0, int i1, int i2) { - auto arg = jit_args_permute(); - - size_t dst_off = i0 * dst_strides[0] + i1 * dst_strides[1] + i2 * dst_strides[2]; - size_t src_off = i0 * src_strides[0] + i1 * src_strides[1] + i2 * src_strides[2]; - arg.src = &src_data[src_off * jcp.data_size]; - arg.dst = 
&dst_data[dst_off * jcp.data_size]; - - (*permute_kernel)(&arg); - }); - break; + case 1: + parallel_for(dst_dims[0], [&](int i0) { + auto arg = jit_args_permute(); + + size_t dst_off = i0 * dst_strides[0]; + size_t src_off = i0 * src_strides[0]; + arg.src = &src_data[src_off * jcp.data_size]; + arg.dst = &dst_data[dst_off * jcp.data_size]; + + (*permute_kernel)(&arg); + }); + break; + case 2: + parallel_for2d(dst_dims[0], dst_dims[1], [&](int i0, int i1) { + auto arg = jit_args_permute(); + + size_t dst_off = i0 * dst_strides[0] + i1 * dst_strides[1]; + size_t src_off = i0 * src_strides[0] + i1 * src_strides[1]; + arg.src = &src_data[src_off * jcp.data_size]; + arg.dst = &dst_data[dst_off * jcp.data_size]; + + (*permute_kernel)(&arg); + }); + break; + case 3: + parallel_for3d(dst_dims[0], dst_dims[1], dst_dims[2], [&](int i0, int i1, int i2) { + auto arg = jit_args_permute(); + + size_t dst_off = i0 * dst_strides[0] + i1 * dst_strides[1] + i2 * dst_strides[2]; + size_t src_off = i0 * src_strides[0] + i1 * src_strides[1] + i2 * src_strides[2]; + arg.src = &src_data[src_off * jcp.data_size]; + arg.dst = &dst_data[dst_off * jcp.data_size]; + + (*permute_kernel)(&arg); + }); + break; } return; } @@ -245,12 +268,10 @@ size_t PermuteParams::hash() const { } bool PermuteParams::operator==(const PermuteParams& rhs) const { - return (src_block_dims == rhs.src_block_dims) && - (dst_block_dims == rhs.dst_block_dims) && - (src_block_order == rhs.src_block_order) && - (dst_block_order == rhs.dst_block_order) && (order == rhs.order) && - (data_size == rhs.data_size); + return (src_block_dims == rhs.src_block_dims) && (dst_block_dims == rhs.dst_block_dims) && + (src_block_order == rhs.src_block_order) && (dst_block_order == rhs.dst_block_order) && + (order == rhs.order) && (data_size == rhs.data_size); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/common/permute_kernel.h 
b/src/plugins/intel_cpu/src/nodes/common/permute_kernel.h index ac665efb4f0bb6..ba7a89d746d945 100644 --- a/src/plugins/intel_cpu/src/nodes/common/permute_kernel.h +++ b/src/plugins/intel_cpu/src/nodes/common/permute_kernel.h @@ -38,9 +38,9 @@ struct jit_args_permute { }; struct jit_uni_permute_kernel { - void (*ker_)(const jit_args_permute *); + void (*ker_)(const jit_args_permute*); - void operator()(const jit_args_permute *args) { + void operator()(const jit_args_permute* args) { assert(ker_); ker_(args); } @@ -71,5 +71,5 @@ class PermuteKernel { PermuteParams params; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/common/reorder_prim.cpp b/src/plugins/intel_cpu/src/nodes/common/reorder_prim.cpp index 93e145b25b9e95..dd07a721260aac 100644 --- a/src/plugins/intel_cpu/src/nodes/common/reorder_prim.cpp +++ b/src/plugins/intel_cpu/src/nodes/common/reorder_prim.cpp @@ -4,15 +4,14 @@ #include "reorder_prim.h" -#include "dnnl_extension_utils.h" -#include "dnnl_types.h" - #include -#include "common/primitive_hashing_utils.hpp" -#include "cpu/x64/cpu_isa_traits.hpp" #include #include +#include "common/primitive_hashing_utils.hpp" +#include "cpu/x64/cpu_isa_traits.hpp" +#include "dnnl_extension_utils.h" +#include "dnnl_types.h" #include "utils/general_utils.h" namespace ov { diff --git a/src/plugins/intel_cpu/src/nodes/common/softmax.cpp b/src/plugins/intel_cpu/src/nodes/common/softmax.cpp index 66a6ca9c1b6f53..0fcc87f8978752 100644 --- a/src/plugins/intel_cpu/src/nodes/common/softmax.cpp +++ b/src/plugins/intel_cpu/src/nodes/common/softmax.cpp @@ -4,17 +4,17 @@ #include "softmax.h" -#include "openvino/core/parallel.hpp" -#include "cpu/x64/jit_generator.hpp" -#include "cpu/x64/injectors/jit_uni_eltwise_injector.hpp" -#include "onednn/dnnl.h" -#include "utils/bfloat16.hpp" -#include "emitters/plugin/x64/jit_bf16_emitters.hpp" - #include #include #include +#include 
"cpu/x64/injectors/jit_uni_eltwise_injector.hpp" +#include "cpu/x64/jit_generator.hpp" +#include "emitters/plugin/x64/jit_bf16_emitters.hpp" +#include "onednn/dnnl.h" +#include "openvino/core/parallel.hpp" +#include "utils/bfloat16.hpp" + using namespace dnnl; using namespace dnnl::impl::cpu; using namespace dnnl::impl::cpu::x64; @@ -38,11 +38,13 @@ struct jit_softmax_config_params { ov::element::Type dst_dt; }; - struct jit_uni_softmax_kernel { - void (*ker_)(const jit_args_softmax *); + void (*ker_)(const jit_args_softmax*); - void operator()(const jit_args_softmax *args) { assert(ker_); ker_(args); } + void operator()(const jit_args_softmax* args) { + assert(ker_); + ker_(args); + } jit_uni_softmax_kernel() : ker_(nullptr) {} virtual ~jit_uni_softmax_kernel() {} @@ -54,7 +56,10 @@ template struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_softmax_kernel_f32) - jit_uni_softmax_kernel_f32(jit_softmax_config_params jcp) : jit_uni_softmax_kernel(), jit_generator(jit_name()), jcp_(jcp) {} + jit_uni_softmax_kernel_f32(jit_softmax_config_params jcp) + : jit_uni_softmax_kernel(), + jit_generator(jit_name()), + jcp_(jcp) {} void create_ker() override { jit_generator::create_kernel(); @@ -62,14 +67,14 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge } void generate() override { - exp_injector.reset(new jit_uni_eltwise_injector_f32(this, dnnl::impl::alg_kind::eltwise_exp, 0.f, 0.f, 1.0f)); + exp_injector.reset( + new jit_uni_eltwise_injector_f32(this, dnnl::impl::alg_kind::eltwise_exp, 0.f, 0.f, 1.0f)); if (mayiuse(avx512_core)) uni_vcvtneps2bf16.reset(new jit_uni_vcvtneps2bf16(this, isa)); this->preamble(); - mov(reg_src, ptr[reg_params + GET_OFF(src)]); mov(reg_dst, ptr[reg_params + GET_OFF(dst)]); mov(reg_src_stride, ptr[reg_params + GET_OFF(src_stride)]); @@ -86,7 +91,8 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge 
mov(aux_reg_work_amount, reg_work_amount); mov(aux_reg_src, reg_src); load_vector(vmm_max, ptr[aux_reg_src], jcp_.src_dt); - L(max_loop_label); { + L(max_loop_label); + { cmp(aux_reg_work_amount, 0); jle(max_loop_end_label, T_NEAR); @@ -120,7 +126,8 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge mov(aux_reg_src, reg_src); mov(aux_reg_dst, reg_dst); uni_vpxor(vmm_exp_sum, vmm_exp_sum, vmm_exp_sum); - L(exp_loop_label); { + L(exp_loop_label); + { cmp(aux_reg_work_amount, 0); jle(exp_loop_end_label, T_NEAR); @@ -143,7 +150,8 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge mov(aux_reg_work_amount, reg_work_amount); mov(aux_reg_dst, reg_dst); - L(div_loop_label); { + L(div_loop_label); + { cmp(aux_reg_work_amount, 0); jle(div_loop_end_label, T_NEAR); @@ -196,38 +204,40 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge jit_softmax_config_params jcp_; - inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, ov::element::Type src_dt) { + inline void load_vector(Vmm vmm_src, const Xbyak::Address& op, ov::element::Type src_dt) { switch (src_dt) { - case ov::element::f32: - uni_vmovups(vmm_src, op); - break; - case ov::element::bf16: - vpmovzxwd(vmm_src, op); - uni_vpslld(vmm_src, vmm_src, 16); - break; - default: - assert(!"unknown src_dt"); + case ov::element::f32: + uni_vmovups(vmm_src, op); + break; + case ov::element::bf16: + vpmovzxwd(vmm_src, op); + uni_vpslld(vmm_src, vmm_src, 16); + break; + default: + assert(!"unknown src_dt"); } } - inline void store_vector(const Xbyak::Address &op, Vmm vmm_dst, ov::element::Type dst_dt) { + inline void store_vector(const Xbyak::Address& op, Vmm vmm_dst, ov::element::Type dst_dt) { Xbyak::Ymm ymm_dst = Xbyak::Ymm(vmm_dst.getIdx()); switch (dst_dt) { - case ov::element::f32: - uni_vmovups(op, vmm_dst); - break; - case ov::element::bf16: - uni_vcvtneps2bf16->emit_code({static_cast(vmm_dst.getIdx())}, 
{static_cast(ymm_dst.getIdx())}); - vmovdqu16(op, ymm_dst); - break; - default: - assert(!"unknown dst_dt"); + case ov::element::f32: + uni_vmovups(op, vmm_dst); + break; + case ov::element::bf16: + uni_vcvtneps2bf16->emit_code({static_cast(vmm_dst.getIdx())}, + {static_cast(ymm_dst.getIdx())}); + vmovdqu16(op, ymm_dst); + break; + default: + assert(!"unknown dst_dt"); } } }; #endif SoftmaxGeneric::SoftmaxGeneric(ov::element::Type inpPrc, ov::element::Type outPrc) - : input_prec(inpPrc), output_prec(outPrc) { + : input_prec(inpPrc), + output_prec(outPrc) { if (ov::element::bf16 == output_prec) { if (!mayiuse(avx512_core)) { OPENVINO_THROW("SoftmaxGeneric doesn't support BF16 precision on this target."); @@ -255,27 +265,27 @@ SoftmaxGeneric::SoftmaxGeneric(ov::element::Type inpPrc, ov::element::Type outPr #endif } -template -void SoftmaxGeneric::calculate(const in_data_t *src_data, out_data_t *dst_data, int B, int C, int H, int W) { +template +void SoftmaxGeneric::calculate(const in_data_t* src_data, out_data_t* dst_data, int B, int C, int H, int W) { for (int b = 0; b < B; b++) { int tail_start = 0; if (softmax_kernel) { - int blocks_num = H*W / block_size; + int blocks_num = H * W / block_size; parallel_for(blocks_num, [&](int ib) { auto arg = jit_args_softmax(); arg.src = src_data + b * C * H * W + ib * block_size; arg.dst = dst_data + b * C * H * W + ib * block_size; - arg.src_stride = static_cast((size_t)(H) * W * sizeof(in_data_t)); - arg.dst_stride = static_cast((size_t)(H) * W * sizeof(out_data_t)); + arg.src_stride = static_cast((size_t)(H)*W * sizeof(in_data_t)); + arg.dst_stride = static_cast((size_t)(H)*W * sizeof(out_data_t)); arg.work_amount = static_cast(C); (*softmax_kernel)(&arg); }); - tail_start = (H*W / block_size) * block_size; + tail_start = (H * W / block_size) * block_size; } parallel_for(H * W - tail_start, [&](int i) { @@ -283,7 +293,8 @@ void SoftmaxGeneric::calculate(const in_data_t *src_data, out_data_t *dst_data, float max = src_data[b 
* C * H * W + offset]; for (int c = 0; c < C; c++) { float val = src_data[b * C * H * W + c * H * W + offset]; - if (val > max) max = val; + if (val > max) + max = val; } float expSum = 0; @@ -299,7 +310,7 @@ void SoftmaxGeneric::calculate(const in_data_t *src_data, out_data_t *dst_data, } } -void SoftmaxGeneric::execute(const uint8_t *src_data, uint8_t *dst_data, int B, int C, int H, int W) { +void SoftmaxGeneric::execute(const uint8_t* src_data, uint8_t* dst_data, int B, int C, int H, int W) { if (ov::element::f32 == input_prec) { auto float_src_data = reinterpret_cast(src_data); if (ov::element::f32 == output_prec) { @@ -327,5 +338,5 @@ void SoftmaxGeneric::execute(const uint8_t *src_data, uint8_t *dst_data, int B, } } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/common/softmax.h b/src/plugins/intel_cpu/src/nodes/common/softmax.h index 2e3d5caa4becee..bb450c2ac5a303 100644 --- a/src/plugins/intel_cpu/src/nodes/common/softmax.h +++ b/src/plugins/intel_cpu/src/nodes/common/softmax.h @@ -4,27 +4,28 @@ #pragma once -#include #include -#include "openvino/core/type/element_type.hpp" +#include + #include "defs.h" #include "openvino/core/parallel.hpp" +#include "openvino/core/type/element_type.hpp" namespace ov { namespace intel_cpu { struct jit_uni_softmax_kernel; -static inline -void softmax_many_batches(const float *src_data, float *dst_data, int B, int C, int H, int W) { +static inline void softmax_many_batches(const float* src_data, float* dst_data, int B, int C, int H, int W) { ov::parallel_for(B * H * W, [&](size_t i) { - const float *psrc = src_data + (i / (H * W)) * C * H * W - (i / (H * W)) * H * W; - float *pdst = dst_data + (i / (H * W)) * C * H * W - (i / (H * W)) * H * W; + const float* psrc = src_data + (i / (H * W)) * C * H * W - (i / (H * W)) * H * W; + float* pdst = dst_data + (i / (H * W)) * C * H * W - (i / (H * W)) * H * W; float max = psrc[i]; for (int c = 0; 
c < C; c++) { float val = psrc[c * H * W + i]; - if (val > max) max = val; + if (val > max) + max = val; } float expSum = 0; @@ -43,9 +44,10 @@ class SoftmaxGeneric { public: SoftmaxGeneric(ov::element::Type inpPrc, ov::element::Type outPrc); - void execute(const uint8_t *src_data, uint8_t *dst_data, int B, int C, int H, int W); + void execute(const uint8_t* src_data, uint8_t* dst_data, int B, int C, int H, int W); + private: - template + template void calculate(const in_data_t* src_data, out_data_t* dst_data, int B, int C, int H, int W); private: @@ -54,5 +56,5 @@ class SoftmaxGeneric { std::shared_ptr softmax_kernel; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/common/tile_broadcast_utils.cpp b/src/plugins/intel_cpu/src/nodes/common/tile_broadcast_utils.cpp index 6c62304ab22da7..f482b0876b3f4c 100644 --- a/src/plugins/intel_cpu/src/nodes/common/tile_broadcast_utils.cpp +++ b/src/plugins/intel_cpu/src/nodes/common/tile_broadcast_utils.cpp @@ -4,18 +4,17 @@ #include "tile_broadcast_utils.h" +#include + #include "cpu_convert.h" #include "cpu_memcpy.h" -#include "openvino/core/parallel.hpp" -#include #include "memory_desc/dnnl_blocked_memory_desc.h" - - +#include "openvino/core/parallel.hpp" namespace ov { namespace intel_cpu { -VectorDims TileBroadcastCommon::calculateDenseStrides(const VectorDims &dims) { +VectorDims TileBroadcastCommon::calculateDenseStrides(const VectorDims& dims) { VectorDims strides(dims.size(), 1); for (int i = strides.size() - 2; i >= 0; i--) { @@ -25,8 +24,10 @@ VectorDims TileBroadcastCommon::calculateDenseStrides(const VectorDims &dims) { return strides; } -void TileBroadcastCommon::fillOptimizedDimsAndSrcStrides(const VectorDims& srcBlockedDims, const VectorDims& blockedRepeats, - VectorDims& optimizedDims, VectorDims& optimizedSrcStrides) { +void TileBroadcastCommon::fillOptimizedDimsAndSrcStrides(const VectorDims& srcBlockedDims, + const 
VectorDims& blockedRepeats, + VectorDims& optimizedDims, + VectorDims& optimizedSrcStrides) { optimizedDims.clear(); optimizedSrcStrides.clear(); VectorDims srcBlockedStrides = calculateDenseStrides(srcBlockedDims); @@ -60,10 +61,11 @@ void TileBroadcastCommon::fillOptimizedDimsAndSrcStrides(const VectorDims& srcBl } } -bool TileBroadcastCommon::canBeExecutedInBlockedLayout(VectorDims srcBlockedDims, VectorDims blockedRepeats, - const size_t elemsInBlock) { - if (srcBlockedDims.empty() || blockedRepeats.empty() || elemsInBlock == 0lu || srcBlockedDims[1] == Shape::UNDEFINED_DIM || - (blockedRepeats[1] != 1 && srcBlockedDims[1] % elemsInBlock != 0)) +bool TileBroadcastCommon::canBeExecutedInBlockedLayout(VectorDims srcBlockedDims, + VectorDims blockedRepeats, + const size_t elemsInBlock) { + if (srcBlockedDims.empty() || blockedRepeats.empty() || elemsInBlock == 0lu || + srcBlockedDims[1] == Shape::UNDEFINED_DIM || (blockedRepeats[1] != 1 && srcBlockedDims[1] % elemsInBlock != 0)) return false; srcBlockedDims[1] = div_up(srcBlockedDims[1], elemsInBlock); @@ -90,7 +92,7 @@ bool TileBroadcastCommon::canBeExecutedInNSPCLayout(VectorDims srcBlockedDims, V return optimizedDims.size() <= maxNDims; } -std::vector TileBroadcastCommon::getSupportedConfigs(const Node *node, size_t outSize) { +std::vector TileBroadcastCommon::getSupportedConfigs(const Node* node, size_t outSize) { std::vector supportedPrimitiveDescriptors; auto precision = node->getOriginalInputPrecisionAtPort(0); auto dataType = DnnlExtensionUtils::ElementTypeToDataType(precision); @@ -115,26 +117,31 @@ std::vector TileBroadcastCommon::getSupportedConfigs(const Node *node, config.inConfs[0].constant(constMap[0]); config.inConfs[1].inPlace(-1); config.inConfs[1].constant(constMap[1]); - config.inConfs[1].setMemDesc(std::make_shared(ov::element::i32, node->getInputShapeAtPort(1))); + config.inConfs[1].setMemDesc( + std::make_shared(ov::element::i32, node->getInputShapeAtPort(1))); if (config.inConfs.size() == 
3) { config.inConfs[2].inPlace(-1); config.inConfs[2].constant(constMap[2]); - config.inConfs[2].setMemDesc(std::make_shared(ov::element::i32, node->getInputShapeAtPort(2))); + config.inConfs[2].setMemDesc( + std::make_shared(ov::element::i32, node->getInputShapeAtPort(2))); } config.outConfs.resize(outSize); auto pushDesc = [&](dnnl::memory::format_tag inFormat, dnnl::memory::format_tag outFormat) { - config.inConfs[0].setMemDesc(std::make_shared(node->getInputShapeAtPort(0), dataType, inFormat)); + config.inConfs[0].setMemDesc( + std::make_shared(node->getInputShapeAtPort(0), dataType, inFormat)); for (size_t i = 0; i < config.outConfs.size(); i++) { config.outConfs[i].inPlace(-1); config.outConfs[i].constant(false); - config.outConfs[i].setMemDesc(std::make_shared(node->getOutputShapeAtPort(0), dataType, outFormat)); + config.outConfs[i].setMemDesc( + std::make_shared(node->getOutputShapeAtPort(0), dataType, outFormat)); } supportedPrimitiveDescriptors.push_back({config, impl_desc_type::ref}); }; - if (!repeats.empty() && inDataShape.getRank() == outDataShapeRank && (outDataShapeRank == 4 || outDataShapeRank == 5)) { + if (!repeats.empty() && inDataShape.getRank() == outDataShapeRank && + (outDataShapeRank == 4 || outDataShapeRank == 5)) { if (canBeExecutedInBlockedLayout(srcDims, repeats, 16)) { if (outDataShapeRank == 4) { pushDesc(dnnl::memory::format_tag::nChw16c, dnnl::memory::format_tag::nChw16c); @@ -165,7 +172,8 @@ std::vector TileBroadcastCommon::getSupportedConfigs(const Node *node, for (size_t i = 0; i < config.outConfs.size(); i++) { config.outConfs[i].inPlace(-1); config.outConfs[i].constant(false); - config.outConfs[i].setMemDesc(std::make_shared(precision, node->getOutputShapeAtPort(i))); + config.outConfs[i].setMemDesc( + std::make_shared(precision, node->getOutputShapeAtPort(i))); } supportedPrimitiveDescriptors.push_back({config, impl_desc_type::ref}); } else { @@ -175,7 +183,9 @@ std::vector TileBroadcastCommon::getSupportedConfigs(const Node 
*node, return supportedPrimitiveDescriptors; } -bool TileBroadcastCommon::prepareOptimizedParams(const Node *node, VectorDims& srcBlockedDims, VectorDims& dstBlockedDims) { +bool TileBroadcastCommon::prepareOptimizedParams(const Node* node, + VectorDims& srcBlockedDims, + VectorDims& dstBlockedDims) { while (srcBlockedDims.size() < dstBlockedDims.size()) { srcBlockedDims.insert(srcBlockedDims.begin(), 1); } @@ -186,7 +196,8 @@ bool TileBroadcastCommon::prepareOptimizedParams(const Node *node, VectorDims& s blockedRepeats.push_back(1); } // for NSPC layouts - if (node->getBaseMemDescAtInputPort(0)->hasLayoutType(LayoutType::nspc) && one_of(node->getBaseMemDescAtInputPort(0)->getShape().getRank(), 4u, 5u)) { + if (node->getBaseMemDescAtInputPort(0)->hasLayoutType(LayoutType::nspc) && + one_of(node->getBaseMemDescAtInputPort(0)->getShape().getRank(), 4u, 5u)) { blockedRepeats.push_back(blockedRepeats[1]); blockedRepeats.erase(blockedRepeats.begin() + 1); } @@ -205,7 +216,8 @@ bool TileBroadcastCommon::prepareOptimizedParams(const Node *node, VectorDims& s VectorDims optimizedDstStrides = calculateDenseStrides(optimizedDims); - size_t dataSize = node->getSelectedPrimitiveDescriptor()->getConfig().inConfs[0].getMemDesc()->getPrecision().size(); + size_t dataSize = + node->getSelectedPrimitiveDescriptor()->getConfig().inConfs[0].getMemDesc()->getPrecision().size(); for (size_t i = 0; i < optimizedDims.size(); i++) { optimizedSrcStrides[i] *= dataSize; optimizedDstStrides[i] *= dataSize; @@ -221,9 +233,9 @@ bool TileBroadcastCommon::prepareOptimizedParams(const Node *node, VectorDims& s // Broadcast 1 element to N continuous elements based on cpu_memcpy // Step 1: Get the binary format of the number N -// Step 2: Use cpu_memcpy to form fragments containing pow(2, k) (ie. 2, 4, 8, ...) 
elements, based on the given 1 element -// Step 3: Form N continuous elements, who's a combination of those fragments, demonstrated by its binary format -void TileBroadcastCommon::broadcastScalar(const char *srcData, char *dstData, size_t elt_cnt, size_t data_size) { +// Step 2: Use cpu_memcpy to form fragments containing pow(2, k) (ie. 2, 4, 8, ...) elements, based on the given 1 +// element Step 3: Form N continuous elements, who's a combination of those fragments, demonstrated by its binary format +void TileBroadcastCommon::broadcastScalar(const char* srcData, char* dstData, size_t elt_cnt, size_t data_size) { std::vector binary_digits; binary_digits.clear(); @@ -275,32 +287,44 @@ void TileBroadcastCommon::optimizedExecute(const MemoryPtr& srcMemory, const Mem broadcastScalar(srcData, dstData, elt_cnt, data_size); } } else { - parallel_for5d(optimizedParams.dims[0], optimizedParams.dims[1], optimizedParams.dims[2], optimizedParams.dims[3], optimizedParams.dims[4], - [&](int i0, int i1, int i2, int i3, int i4) { - auto srcData2 = srcData + (i0 * optimizedParams.srcStrides[0] + i1 * optimizedParams.srcStrides[1] + - i2 * optimizedParams.srcStrides[2] + i3 * optimizedParams.srcStrides[3] + - i4 * optimizedParams.srcStrides[4]); - auto dstData2 = dstData + (i0 * optimizedParams.dstStrides[0] + i1 * optimizedParams.dstStrides[1] + - i2 * optimizedParams.dstStrides[2] + i3 * optimizedParams.dstStrides[3] + - i4 * optimizedParams.dstStrides[4]); - for (size_t i = 0; i < optimizedParams.dims[5]; i++) { - cpu_memcpy(dstData2 + i * optimizedParams.dstStrides[5], srcData2, optimizedParams.dstStrides[5]); - } - }); + parallel_for5d( + optimizedParams.dims[0], + optimizedParams.dims[1], + optimizedParams.dims[2], + optimizedParams.dims[3], + optimizedParams.dims[4], + [&](int i0, int i1, int i2, int i3, int i4) { + auto srcData2 = srcData + (i0 * optimizedParams.srcStrides[0] + i1 * optimizedParams.srcStrides[1] + + i2 * optimizedParams.srcStrides[2] + i3 * 
optimizedParams.srcStrides[3] + + i4 * optimizedParams.srcStrides[4]); + auto dstData2 = dstData + (i0 * optimizedParams.dstStrides[0] + i1 * optimizedParams.dstStrides[1] + + i2 * optimizedParams.dstStrides[2] + i3 * optimizedParams.dstStrides[3] + + i4 * optimizedParams.dstStrides[4]); + for (size_t i = 0; i < optimizedParams.dims[5]; i++) { + cpu_memcpy(dstData2 + i * optimizedParams.dstStrides[5], + srcData2, + optimizedParams.dstStrides[5]); + } + }); } } else { - parallel_for5d(optimizedParams.dims[0], optimizedParams.dims[1], optimizedParams.dims[2], optimizedParams.dims[3], optimizedParams.dims[4], - [&](int i0, int i1, int i2, int i3, int i4) { - auto srcData2 = srcData + (i0 * optimizedParams.srcStrides[0] + i1 * optimizedParams.srcStrides[1] + - i2 * optimizedParams.srcStrides[2] + i3 * optimizedParams.srcStrides[3] + - i4 * optimizedParams.srcStrides[4]); - auto dstData2 = dstData + (i0 * optimizedParams.dstStrides[0] + i1 * optimizedParams.dstStrides[1] + + parallel_for5d( + optimizedParams.dims[0], + optimizedParams.dims[1], + optimizedParams.dims[2], + optimizedParams.dims[3], + optimizedParams.dims[4], + [&](int i0, int i1, int i2, int i3, int i4) { + auto srcData2 = srcData + (i0 * optimizedParams.srcStrides[0] + i1 * optimizedParams.srcStrides[1] + + i2 * optimizedParams.srcStrides[2] + i3 * optimizedParams.srcStrides[3] + + i4 * optimizedParams.srcStrides[4]); + auto dstData2 = dstData + (i0 * optimizedParams.dstStrides[0] + i1 * optimizedParams.dstStrides[1] + i2 * optimizedParams.dstStrides[2] + i3 * optimizedParams.dstStrides[3] + i4 * optimizedParams.dstStrides[4]); - cpu_memcpy(dstData2, srcData2, optimizedParams.copySize); - }); + cpu_memcpy(dstData2, srcData2, optimizedParams.copySize); + }); } } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/common/tile_broadcast_utils.h b/src/plugins/intel_cpu/src/nodes/common/tile_broadcast_utils.h index 
7ae0eacbccd373..6638eba7f88a39 100644 --- a/src/plugins/intel_cpu/src/nodes/common/tile_broadcast_utils.h +++ b/src/plugins/intel_cpu/src/nodes/common/tile_broadcast_utils.h @@ -9,27 +9,28 @@ #include #include - namespace ov { namespace intel_cpu { class TileBroadcastCommon { protected: - static VectorDims calculateDenseStrides(const VectorDims &dims); - std::vector getSupportedConfigs(const Node *node, size_t outSize); - bool prepareOptimizedParams(const Node *node, VectorDims& srcBlockedDims, VectorDims& dstBlockedDims); + static VectorDims calculateDenseStrides(const VectorDims& dims); + std::vector getSupportedConfigs(const Node* node, size_t outSize); + bool prepareOptimizedParams(const Node* node, VectorDims& srcBlockedDims, VectorDims& dstBlockedDims); void optimizedExecute(const MemoryPtr& srcMemory, const MemoryPtr& dstMemory); VectorDims repeats; bool optimizedCase = false; - bool constMap[3] = { false }; + bool constMap[3] = {false}; mutable bool needPrepareParamsVar = false; private: - static void fillOptimizedDimsAndSrcStrides(const VectorDims &srcBlockedDims, const VectorDims &blockedRepeats, - VectorDims &optimizedDims, VectorDims &optimizedSrcStrides); - static void broadcastScalar(const char *srcData, char *dstData, size_t elt_cnt, size_t data_size); + static void fillOptimizedDimsAndSrcStrides(const VectorDims& srcBlockedDims, + const VectorDims& blockedRepeats, + VectorDims& optimizedDims, + VectorDims& optimizedSrcStrides); + static void broadcastScalar(const char* srcData, char* dstData, size_t elt_cnt, size_t data_size); static bool canBeExecutedInBlockedLayout(VectorDims srcDims, VectorDims repeats, const size_t elemsInBlock); static bool canBeExecutedInNSPCLayout(VectorDims srcDims, VectorDims repeats); @@ -42,5 +43,5 @@ class TileBroadcastCommon { } optimizedParams; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/common/uni_simd.h 
b/src/plugins/intel_cpu/src/nodes/common/uni_simd.h index 7f2cdc7bed4821..dbcec60baa7d4c 100644 --- a/src/plugins/intel_cpu/src/nodes/common/uni_simd.h +++ b/src/plugins/intel_cpu/src/nodes/common/uni_simd.h @@ -5,7 +5,7 @@ #pragma once #if defined(HAVE_SSE) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) -#include +# include #endif namespace ov { @@ -14,348 +14,350 @@ namespace Cpu { #if defined(HAVE_AVX512F) namespace AVX512F { - static inline __m512 _mm_uni_any_ps() { - return __m512{}; - } - - static inline __m512i _mm_uni_any_epi32() { - return __m512i{}; - } - - static inline __m512 _mm_uni_loadu_ps(const float* psrc) { - return _mm512_mask_loadu_ps(_mm_uni_any_ps(), (__mmask16)-1, psrc); - } - - static inline void _mm_uni_storeu_ps(float* pdst, const __m512& vec) { - _mm512_storeu_ps(pdst, vec); - } - - static inline void _mm_uni_storeu_si(void* pdst, const __m512i vec) { - _mm512_storeu_si512(pdst, vec); - } - - static inline __m512 _mm_uni_setzero_ps() { - return _mm512_setzero_ps(); - } - - static inline __m512 _mm_uni_set1_ps(float value) { - return _mm512_set1_ps(value); - } - - static inline __m512 _mm_uni_add_ps(__m512 vec0, __m512 vec1) { - return _mm512_add_ps(vec0, vec1); - } - - static inline __m512 _mm_uni_sub_ps(__m512 vec0, __m512 vec1) { - return _mm512_sub_ps(vec0, vec1); - } - - static inline __m512 _mm_uni_mul_ps(__m512 vec0, __m512 vec1) { - return _mm512_mul_ps(vec0, vec1); - } - - static inline __m512 _mm_uni_div_ps(__m512 vec0, __m512 vec1) { - return _mm512_div_ps(vec0, vec1); - } - - static inline __m512 _mm_uni_sqrt_ps(__m512 vec) { - return _mm512_sqrt_ps(vec); - } - - static inline __m512 _mm_uni_and_ps(__m512 vec0, __m512 vec1) { - return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(vec0), _mm512_castps_si512(vec1))); - } - - static inline __m512 _mm_uni_or_ps(__m512 vec0, __m512 vec1) { - return _mm512_castsi512_ps(_mm512_or_epi32(_mm512_castps_si512(vec0), _mm512_castps_si512(vec1))); - } - - static inline __m512i 
_mm_uni_set1_epi32(int value) { - return _mm512_mask_set1_epi32(_mm_uni_any_epi32(), (__mmask16)-1, value); - } - - static inline __m512 _mm_uni_blendv_ps(__m512 vec0, __m512 vec1, __m512 vmask) { - return _mm512_mask_blend_ps(_mm512_cmpneq_epi32_mask(_mm512_castps_si512(vmask), _mm_uni_set1_epi32(0)), vec0, vec1); - } - - static inline __m512 _mm_uni_blendv_ps(__m512 vec0, __m512 vec1, __mmask16 vmask) { - return _mm512_mask_blend_ps(vmask, vec0, vec1); - } - - static inline __m512 _mm_uni_min_ps(__m512 vec0, __m512 vec1) { - return _mm512_min_ps(vec0, vec1); - } - - static inline __m512 _mm_uni_max_ps(__m512 vec0, __m512 vec1) { - return _mm512_max_ps(vec0, vec1); - } - - static inline __m512 _mm_uni_floor_ps(__m512 vec) { - return _mm512_floor_ps(vec); - } - - static inline __m512i _mm_uni_cvtps_epi32(__m512 vec) { - return _mm512_cvtps_epi32(vec); - } - - static inline __m512i _mm_uni_add_epi32(__m512i vec0, __m512i vec1) { - return _mm512_add_epi32(vec0, vec1); - } - - static inline __m512i _mm_uni_slli_epi32(__m512i vec, int value) { - return _mm512_sll_epi32(vec, _mm_set1_epi64x(value)); - } - - static inline __m512 _mm_uni_castsi_ps(__m512i vec) { - return _mm512_castsi512_ps(vec); - } - - static inline __m512i _mm_uni_setzero_si() { - return _mm512_setzero_si512(); - } - - static inline __mmask16 _mm_uni_cmpgt_ps(__m512 vec0, __m512 vec1) { - return _mm512_cmp_ps_mask(vec0, vec1, 14); - } - - static inline __mmask16 _mm_uni_cmpgt_i32(__m512i vec0, __m512i vec1) { - return _mm512_cmp_epi32_mask(vec1, vec0, 1); - } - - static inline __m512i _mm_uni_castps_si(__m512 vec) { - return _mm512_castps_si512(vec); - } - - static inline __m512 _mm_uni_cvtepi32_ps(__m512i vec) { - return _mm512_mask_cvtepi32_ps(_mm_uni_any_ps(), (__mmask16)-1, vec); - } +static inline __m512 _mm_uni_any_ps() { + return __m512{}; +} + +static inline __m512i _mm_uni_any_epi32() { + return __m512i{}; +} + +static inline __m512 _mm_uni_loadu_ps(const float* psrc) { + return 
_mm512_mask_loadu_ps(_mm_uni_any_ps(), (__mmask16)-1, psrc); +} + +static inline void _mm_uni_storeu_ps(float* pdst, const __m512& vec) { + _mm512_storeu_ps(pdst, vec); +} + +static inline void _mm_uni_storeu_si(void* pdst, const __m512i vec) { + _mm512_storeu_si512(pdst, vec); +} + +static inline __m512 _mm_uni_setzero_ps() { + return _mm512_setzero_ps(); +} + +static inline __m512 _mm_uni_set1_ps(float value) { + return _mm512_set1_ps(value); +} + +static inline __m512 _mm_uni_add_ps(__m512 vec0, __m512 vec1) { + return _mm512_add_ps(vec0, vec1); +} + +static inline __m512 _mm_uni_sub_ps(__m512 vec0, __m512 vec1) { + return _mm512_sub_ps(vec0, vec1); +} + +static inline __m512 _mm_uni_mul_ps(__m512 vec0, __m512 vec1) { + return _mm512_mul_ps(vec0, vec1); +} + +static inline __m512 _mm_uni_div_ps(__m512 vec0, __m512 vec1) { + return _mm512_div_ps(vec0, vec1); +} + +static inline __m512 _mm_uni_sqrt_ps(__m512 vec) { + return _mm512_sqrt_ps(vec); +} + +static inline __m512 _mm_uni_and_ps(__m512 vec0, __m512 vec1) { + return _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(vec0), _mm512_castps_si512(vec1))); +} + +static inline __m512 _mm_uni_or_ps(__m512 vec0, __m512 vec1) { + return _mm512_castsi512_ps(_mm512_or_epi32(_mm512_castps_si512(vec0), _mm512_castps_si512(vec1))); +} + +static inline __m512i _mm_uni_set1_epi32(int value) { + return _mm512_mask_set1_epi32(_mm_uni_any_epi32(), (__mmask16)-1, value); +} + +static inline __m512 _mm_uni_blendv_ps(__m512 vec0, __m512 vec1, __m512 vmask) { + return _mm512_mask_blend_ps(_mm512_cmpneq_epi32_mask(_mm512_castps_si512(vmask), _mm_uni_set1_epi32(0)), + vec0, + vec1); +} + +static inline __m512 _mm_uni_blendv_ps(__m512 vec0, __m512 vec1, __mmask16 vmask) { + return _mm512_mask_blend_ps(vmask, vec0, vec1); +} + +static inline __m512 _mm_uni_min_ps(__m512 vec0, __m512 vec1) { + return _mm512_min_ps(vec0, vec1); +} + +static inline __m512 _mm_uni_max_ps(__m512 vec0, __m512 vec1) { + return _mm512_max_ps(vec0, 
vec1); +} + +static inline __m512 _mm_uni_floor_ps(__m512 vec) { + return _mm512_floor_ps(vec); +} + +static inline __m512i _mm_uni_cvtps_epi32(__m512 vec) { + return _mm512_cvtps_epi32(vec); +} + +static inline __m512i _mm_uni_add_epi32(__m512i vec0, __m512i vec1) { + return _mm512_add_epi32(vec0, vec1); +} + +static inline __m512i _mm_uni_slli_epi32(__m512i vec, int value) { + return _mm512_sll_epi32(vec, _mm_set1_epi64x(value)); +} + +static inline __m512 _mm_uni_castsi_ps(__m512i vec) { + return _mm512_castsi512_ps(vec); +} + +static inline __m512i _mm_uni_setzero_si() { + return _mm512_setzero_si512(); +} + +static inline __mmask16 _mm_uni_cmpgt_ps(__m512 vec0, __m512 vec1) { + return _mm512_cmp_ps_mask(vec0, vec1, 14); +} + +static inline __mmask16 _mm_uni_cmpgt_i32(__m512i vec0, __m512i vec1) { + return _mm512_cmp_epi32_mask(vec1, vec0, 1); +} + +static inline __m512i _mm_uni_castps_si(__m512 vec) { + return _mm512_castps_si512(vec); +} + +static inline __m512 _mm_uni_cvtepi32_ps(__m512i vec) { + return _mm512_mask_cvtepi32_ps(_mm_uni_any_ps(), (__mmask16)-1, vec); +} } // namespace AVX512F #elif defined(HAVE_AVX2) namespace AVX2 { - static inline __m256 _mm_uni_loadu_ps(const float* psrc) { - return _mm256_loadu_ps(psrc); - } +static inline __m256 _mm_uni_loadu_ps(const float* psrc) { + return _mm256_loadu_ps(psrc); +} - static inline void _mm_uni_storeu_ps(float* pdst, const __m256 vec) { - _mm256_storeu_ps(pdst, vec); - } +static inline void _mm_uni_storeu_ps(float* pdst, const __m256 vec) { + _mm256_storeu_ps(pdst, vec); +} - static inline void _mm_uni_storeu_si(__m256i* pdst, const __m256i vec) { - _mm256_storeu_si256(pdst, vec); - } +static inline void _mm_uni_storeu_si(__m256i* pdst, const __m256i vec) { + _mm256_storeu_si256(pdst, vec); +} - static inline __m256 _mm_uni_setzero_ps() { - return _mm256_setzero_ps(); - } +static inline __m256 _mm_uni_setzero_ps() { + return _mm256_setzero_ps(); +} - static inline __m256 _mm_uni_set1_ps(float value) { - 
return _mm256_set1_ps(value); - } +static inline __m256 _mm_uni_set1_ps(float value) { + return _mm256_set1_ps(value); +} - static inline __m256 _mm_uni_add_ps(__m256 vec0, __m256 vec1) { - return _mm256_add_ps(vec0, vec1); - } +static inline __m256 _mm_uni_add_ps(__m256 vec0, __m256 vec1) { + return _mm256_add_ps(vec0, vec1); +} - static inline __m256 _mm_uni_sub_ps(__m256 vec0, __m256 vec1) { - return _mm256_sub_ps(vec0, vec1); - } +static inline __m256 _mm_uni_sub_ps(__m256 vec0, __m256 vec1) { + return _mm256_sub_ps(vec0, vec1); +} - static inline __m256 _mm_uni_mul_ps(__m256 vec0, __m256 vec1) { - return _mm256_mul_ps(vec0, vec1); - } +static inline __m256 _mm_uni_mul_ps(__m256 vec0, __m256 vec1) { + return _mm256_mul_ps(vec0, vec1); +} - static inline __m256 _mm_uni_div_ps(__m256 vec0, __m256 vec1) { - return _mm256_div_ps(vec0, vec1); - } +static inline __m256 _mm_uni_div_ps(__m256 vec0, __m256 vec1) { + return _mm256_div_ps(vec0, vec1); +} - static inline __m256 _mm_uni_sqrt_ps(__m256 vec) { - return _mm256_sqrt_ps(vec); - } +static inline __m256 _mm_uni_sqrt_ps(__m256 vec) { + return _mm256_sqrt_ps(vec); +} - static inline __m256 _mm_uni_and_ps(__m256 vec0, __m256 vec1) { - return _mm256_and_ps(vec0, vec1); - } +static inline __m256 _mm_uni_and_ps(__m256 vec0, __m256 vec1) { + return _mm256_and_ps(vec0, vec1); +} - static inline __m256 _mm_uni_or_ps(__m256 vec0, __m256 vec1) { - return _mm256_or_ps(vec0, vec1); - } +static inline __m256 _mm_uni_or_ps(__m256 vec0, __m256 vec1) { + return _mm256_or_ps(vec0, vec1); +} - static inline __m256 _mm_uni_blendv_ps(__m256 vec0, __m256 vec1, __m256 vmask) { - return _mm256_blendv_ps(vec0, vec1, vmask); - } +static inline __m256 _mm_uni_blendv_ps(__m256 vec0, __m256 vec1, __m256 vmask) { + return _mm256_blendv_ps(vec0, vec1, vmask); +} - static inline __m256 _mm_uni_min_ps(__m256 vec0, __m256 vec1) { - return _mm256_min_ps(vec0, vec1); - } +static inline __m256 _mm_uni_min_ps(__m256 vec0, __m256 vec1) { + return 
_mm256_min_ps(vec0, vec1); +} - static inline __m256 _mm_uni_max_ps(__m256 vec0, __m256 vec1) { - return _mm256_max_ps(vec0, vec1); - } +static inline __m256 _mm_uni_max_ps(__m256 vec0, __m256 vec1) { + return _mm256_max_ps(vec0, vec1); +} - static inline __m256 _mm_uni_floor_ps(__m256 vec) { - return _mm256_floor_ps(vec); - } +static inline __m256 _mm_uni_floor_ps(__m256 vec) { + return _mm256_floor_ps(vec); +} - static inline __m256i _mm_uni_cvtps_epi32(__m256 vec) { - return _mm256_cvtps_epi32(vec); - } +static inline __m256i _mm_uni_cvtps_epi32(__m256 vec) { + return _mm256_cvtps_epi32(vec); +} - static inline __m256i _mm_uni_add_epi32(__m256i vec0, __m256i vec1) { - return _mm256_add_epi32(vec0, vec1); - } +static inline __m256i _mm_uni_add_epi32(__m256i vec0, __m256i vec1) { + return _mm256_add_epi32(vec0, vec1); +} - static inline __m256i _mm_uni_set1_epi32(int value) { - return _mm256_set1_epi32(value); - } +static inline __m256i _mm_uni_set1_epi32(int value) { + return _mm256_set1_epi32(value); +} - static inline __m256i _mm_uni_slli_epi32(__m256i vec, int value) { - return _mm256_slli_epi32(vec, value); - } +static inline __m256i _mm_uni_slli_epi32(__m256i vec, int value) { + return _mm256_slli_epi32(vec, value); +} - static inline __m256 _mm_uni_castsi_ps(__m256i vec) { - return _mm256_castsi256_ps(vec); - } +static inline __m256 _mm_uni_castsi_ps(__m256i vec) { + return _mm256_castsi256_ps(vec); +} - static inline __m256i _mm_uni_setzero_si() { - return _mm256_setzero_si256(); - } +static inline __m256i _mm_uni_setzero_si() { + return _mm256_setzero_si256(); +} - static inline __m256 _mm_uni_cmpgt_ps(__m256 vec0, __m256 vec1) { - return _mm256_cmp_ps(vec0, vec1, 14); - } +static inline __m256 _mm_uni_cmpgt_ps(__m256 vec0, __m256 vec1) { + return _mm256_cmp_ps(vec0, vec1, 14); +} - static inline __m256 _mm_uni_cmpgt_i32(__m256i vec0, __m256i vec1) { - return _mm256_cvtepi32_ps(_mm256_cmpgt_epi32(vec0, vec1)); - } +static inline __m256 
_mm_uni_cmpgt_i32(__m256i vec0, __m256i vec1) { + return _mm256_cvtepi32_ps(_mm256_cmpgt_epi32(vec0, vec1)); +} - static inline __m256i _mm_uni_blendv_epi8(__m256i vec0, __m256i vec1, __m256i vmask) { - return _mm256_blendv_epi8(vec0, vec1, vmask); - } +static inline __m256i _mm_uni_blendv_epi8(__m256i vec0, __m256i vec1, __m256i vmask) { + return _mm256_blendv_epi8(vec0, vec1, vmask); +} - static inline __m256i _mm_uni_castps_si(__m256 vec) { - return _mm256_castps_si256(vec); - } +static inline __m256i _mm_uni_castps_si(__m256 vec) { + return _mm256_castps_si256(vec); +} - static inline __m256 _mm_uni_cvtepi32_ps(__m256i vec) { - return _mm256_cvtepi32_ps(vec); - } +static inline __m256 _mm_uni_cvtepi32_ps(__m256i vec) { + return _mm256_cvtepi32_ps(vec); +} - static inline int _mm_uni_movemask_ps(__m256 vec) { - return _mm256_movemask_ps(vec); - } +static inline int _mm_uni_movemask_ps(__m256 vec) { + return _mm256_movemask_ps(vec); +} } // namespace AVX2 #elif defined(HAVE_SSE42) namespace SSE42 { - static inline __m128 _mm_uni_loadu_ps(const float* psrc) { - return _mm_loadu_ps(psrc); - } +static inline __m128 _mm_uni_loadu_ps(const float* psrc) { + return _mm_loadu_ps(psrc); +} - static inline void _mm_uni_storeu_ps(float* pdst, const __m128 vec) { - _mm_storeu_ps(pdst, vec); - } - - static inline void _mm_uni_storeu_si(__m128i* pdst, const __m128i vec) { - _mm_storeu_si128(pdst, vec); - } - - static inline __m128 _mm_uni_setzero_ps() { - return _mm_setzero_ps(); - } - - static inline __m128 _mm_uni_set1_ps(float value) { - return _mm_set1_ps(value); - } - - static inline __m128 _mm_uni_add_ps(__m128 vec0, __m128 vec1) { - return _mm_add_ps(vec0, vec1); - } - - static inline __m128 _mm_uni_sub_ps(__m128 vec0, __m128 vec1) { - return _mm_sub_ps(vec0, vec1); - } - - static inline __m128 _mm_uni_mul_ps(__m128 vec0, __m128 vec1) { - return _mm_mul_ps(vec0, vec1); - } - - static inline __m128 _mm_uni_div_ps(__m128 vec0, __m128 vec1) { - return _mm_div_ps(vec0, 
vec1); - } - - static inline __m128 _mm_uni_sqrt_ps(__m128 vec) { - return _mm_sqrt_ps(vec); - } - - static inline __m128 _mm_uni_and_ps(__m128 vec0, __m128 vec1) { - return _mm_and_ps(vec0, vec1); - } - - static inline __m128 _mm_uni_or_ps(__m128 vec0, __m128 vec1) { - return _mm_or_ps(vec0, vec1); - } - - static inline __m128 _mm_uni_blendv_ps(__m128 vec0, __m128 vec1, __m128 vmask) { - return _mm_blendv_ps(vec0, vec1, vmask); - } - - static inline __m128 _mm_uni_min_ps(__m128 vec0, __m128 vec1) { - return _mm_min_ps(vec0, vec1); - } - - static inline __m128 _mm_uni_max_ps(__m128 vec0, __m128 vec1) { - return _mm_max_ps(vec0, vec1); - } - - static inline __m128 _mm_uni_floor_ps(__m128 vec) { - return _mm_floor_ps(vec); - } - - static inline __m128i _mm_uni_cvtps_epi32(__m128 vec) { - return _mm_cvtps_epi32(vec); - } - - static inline __m128i _mm_uni_add_epi32(__m128i vec0, __m128i vec1) { - return _mm_add_epi32(vec0, vec1); - } - - static inline __m128i _mm_uni_set1_epi32(int value) { - return _mm_set1_epi32(value); - } - - static inline __m128i _mm_uni_slli_epi32(__m128i vec, int value) { - return _mm_slli_epi32(vec, value); - } - - static inline __m128 _mm_uni_castsi_ps(__m128i vec) { - return _mm_castsi128_ps(vec); - } - - static inline __m128i _mm_uni_setzero_si() { - return _mm_setzero_si128(); - } - - static inline __m128 _mm_uni_cmpgt_ps(__m128 vec0, __m128 vec1) { - return _mm_cmpgt_ps(vec0, vec1); - } - - static inline __m128 _mm_uni_cmpgt_i32(__m128i vec0, __m128i vec1) { - return _mm_cvtepi32_ps(_mm_cmpgt_epi32(vec0, vec1)); - } - - static inline __m128i _mm_uni_blendv_epi8(__m128i vec0, __m128i vec1, __m128i vmask) { - return _mm_blendv_epi8(vec0, vec1, vmask); - } - - static inline __m128i _mm_uni_castps_si(__m128 vec) { - return _mm_castps_si128(vec); - } +static inline void _mm_uni_storeu_ps(float* pdst, const __m128 vec) { + _mm_storeu_ps(pdst, vec); +} + +static inline void _mm_uni_storeu_si(__m128i* pdst, const __m128i vec) { + 
_mm_storeu_si128(pdst, vec); +} + +static inline __m128 _mm_uni_setzero_ps() { + return _mm_setzero_ps(); +} + +static inline __m128 _mm_uni_set1_ps(float value) { + return _mm_set1_ps(value); +} + +static inline __m128 _mm_uni_add_ps(__m128 vec0, __m128 vec1) { + return _mm_add_ps(vec0, vec1); +} + +static inline __m128 _mm_uni_sub_ps(__m128 vec0, __m128 vec1) { + return _mm_sub_ps(vec0, vec1); +} + +static inline __m128 _mm_uni_mul_ps(__m128 vec0, __m128 vec1) { + return _mm_mul_ps(vec0, vec1); +} + +static inline __m128 _mm_uni_div_ps(__m128 vec0, __m128 vec1) { + return _mm_div_ps(vec0, vec1); +} + +static inline __m128 _mm_uni_sqrt_ps(__m128 vec) { + return _mm_sqrt_ps(vec); +} + +static inline __m128 _mm_uni_and_ps(__m128 vec0, __m128 vec1) { + return _mm_and_ps(vec0, vec1); +} + +static inline __m128 _mm_uni_or_ps(__m128 vec0, __m128 vec1) { + return _mm_or_ps(vec0, vec1); +} + +static inline __m128 _mm_uni_blendv_ps(__m128 vec0, __m128 vec1, __m128 vmask) { + return _mm_blendv_ps(vec0, vec1, vmask); +} + +static inline __m128 _mm_uni_min_ps(__m128 vec0, __m128 vec1) { + return _mm_min_ps(vec0, vec1); +} + +static inline __m128 _mm_uni_max_ps(__m128 vec0, __m128 vec1) { + return _mm_max_ps(vec0, vec1); +} + +static inline __m128 _mm_uni_floor_ps(__m128 vec) { + return _mm_floor_ps(vec); +} + +static inline __m128i _mm_uni_cvtps_epi32(__m128 vec) { + return _mm_cvtps_epi32(vec); +} + +static inline __m128i _mm_uni_add_epi32(__m128i vec0, __m128i vec1) { + return _mm_add_epi32(vec0, vec1); +} + +static inline __m128i _mm_uni_set1_epi32(int value) { + return _mm_set1_epi32(value); +} + +static inline __m128i _mm_uni_slli_epi32(__m128i vec, int value) { + return _mm_slli_epi32(vec, value); +} + +static inline __m128 _mm_uni_castsi_ps(__m128i vec) { + return _mm_castsi128_ps(vec); +} + +static inline __m128i _mm_uni_setzero_si() { + return _mm_setzero_si128(); +} + +static inline __m128 _mm_uni_cmpgt_ps(__m128 vec0, __m128 vec1) { + return _mm_cmpgt_ps(vec0, 
vec1); +} + +static inline __m128 _mm_uni_cmpgt_i32(__m128i vec0, __m128i vec1) { + return _mm_cvtepi32_ps(_mm_cmpgt_epi32(vec0, vec1)); +} + +static inline __m128i _mm_uni_blendv_epi8(__m128i vec0, __m128i vec1, __m128i vmask) { + return _mm_blendv_epi8(vec0, vec1, vmask); +} + +static inline __m128i _mm_uni_castps_si(__m128 vec) { + return _mm_castps_si128(vec); +} - static inline __m128 _mm_uni_cvtepi32_ps(__m128i vec) { - return _mm_cvtepi32_ps(vec); - } - static inline int _mm_uni_movemask_ps(__m128 vec) { - return _mm_movemask_ps(vec); - } +static inline __m128 _mm_uni_cvtepi32_ps(__m128i vec) { + return _mm_cvtepi32_ps(vec); +} +static inline int _mm_uni_movemask_ps(__m128 vec) { + return _mm_movemask_ps(vec); +} } // namespace SSE42 #endif diff --git a/src/plugins/intel_cpu/src/nodes/composite.cpp b/src/plugins/intel_cpu/src/nodes/composite.cpp index a1ceabd6942db1..616d3df6950e9a 100644 --- a/src/plugins/intel_cpu/src/nodes/composite.cpp +++ b/src/plugins/intel_cpu/src/nodes/composite.cpp @@ -4,11 +4,11 @@ #include "composite.h" -#include "nodes/input.h" #include "cpu_memory.h" +#include "nodes/input.h" +#include "shape_inference/shape_inference_internal_dyn.hpp" #include "transformations/cpu_opset/common/op/submodel.hpp" #include "utils/debug_capabilities.h" -#include "shape_inference/shape_inference_internal_dyn.hpp" namespace ov { namespace intel_cpu { diff --git a/src/plugins/intel_cpu/src/nodes/concat.cpp b/src/plugins/intel_cpu/src/nodes/concat.cpp index 635f37b2d05b3a..ef621947d723a7 100644 --- a/src/plugins/intel_cpu/src/nodes/concat.cpp +++ b/src/plugins/intel_cpu/src/nodes/concat.cpp @@ -4,29 +4,29 @@ #include "concat.h" -#include "openvino/op/concat.hpp" +#include +#include +#include +#include +#include #include #include #include -#include "dnnl_extension_utils.h" +#include "common/blocked_desc_creator.h" +#include "common/cpu_memcpy.h" +#include "dnnl_extension_utils.h" #include "onednn/dnnl.h" -#include -#include -#include #include 
"openvino/core/parallel.hpp" -#include "common/cpu_memcpy.h" -#include "common/blocked_desc_creator.h" -#include -#include +#include "openvino/op/concat.hpp" using namespace dnnl; namespace ov { namespace intel_cpu { namespace node { namespace { - constexpr size_t channelAxis = 1lu; +constexpr size_t channelAxis = 1lu; } bool Concat::isExecutable() const { @@ -86,11 +86,14 @@ void Concat::getSupportedDescriptors() { } } - // we need the first dims before axis to be 1 to avoid the reorder in the edge between the first parent and this concat + // we need the first dims before axis to be 1 to avoid the reorder in the edge between the first parent and this + // concat const auto& childDims = outputShapes[0].getDims(); if (childDims[axis] != Shape::UNDEFINED_DIM && - std::all_of(childDims.begin(), childDims.begin() + axis, [](size_t dim) { return dim == 1; })) + std::all_of(childDims.begin(), childDims.begin() + axis, [](size_t dim) { + return dim == 1; + })) canBeInPlace = true; } @@ -118,11 +121,11 @@ void Concat::initSupportedPrimitiveDescriptors() { const auto& dstShape = getOutputShapeAtPort(0); std::vector tdCreatorTypes = {LayoutType::ncsp, LayoutType::nspc}; - // check if blocked layouts are available the channels size should be evenly divided by the block size to avoid slow oneDNN ref implementation and allow - // inPlace memory usage if possible + // check if blocked layouts are available the channels size should be evenly divided by the block size to avoid slow + // oneDNN ref implementation and allow inPlace memory usage if possible if (dstShape.getRank() > channelAxis) { - for (auto& item : { std::make_pair(8lu, LayoutType::nCsp8c), std::make_pair(16lu, LayoutType::nCsp16c)}) { - const VectorDims &blkDims = dstShape.getDims(); + for (auto& item : {std::make_pair(8lu, LayoutType::nCsp8c), std::make_pair(16lu, LayoutType::nCsp16c)}) { + const VectorDims& blkDims = dstShape.getDims(); if (blkDims[channelAxis] == Shape::UNDEFINED_DIM || blkDims[channelAxis] % 
item.first != 0) continue; @@ -144,7 +147,8 @@ void Concat::initSupportedPrimitiveDescriptors() { auto& creatorsMap = BlockedDescCreator::getCommonCreators(); - auto itrRange = BlockedDescCreator::makeFilteredRange(creatorsMap, static_cast(dstShape.getRank()), tdCreatorTypes); + auto itrRange = + BlockedDescCreator::makeFilteredRange(creatorsMap, static_cast(dstShape.getRank()), tdCreatorTypes); for (auto itr = itrRange.first; itr != itrRange.second; ++itr) { NodeConfig config; @@ -183,12 +187,15 @@ void Concat::initSupportedPrimitiveDescriptors() { } } - if (!canBeInPlace || std::any_of(inputShapes.begin(), inputShapes.end(), [](const Shape& shape) { return shape.hasZeroDims(); })) + if (!canBeInPlace || std::any_of(inputShapes.begin(), inputShapes.end(), [](const Shape& shape) { + return shape.hasZeroDims(); + })) return; // Optimized inplace case for (auto refPdIndex : pdIndexesToReuse) { - auto config = supportedPrimitiveDescriptors[refPdIndex].getConfig();; + auto config = supportedPrimitiveDescriptors[refPdIndex].getConfig(); + ; for (size_t i = 0; i < config.inConfs.size(); i++) { config.inConfs[i].inPlace(0); } @@ -204,12 +211,16 @@ void Concat::selectOptimalPrimitiveDescriptor() { // for that case. 
for (size_t i = 0; i < getParentEdges().size(); i++) { for (size_t j = i + 1; j < getParentEdges().size(); j++) { - if (getParentEdgeAt(i) == getParentEdgeAt(j)) canBeInPlace = false; + if (getParentEdgeAt(i) == getParentEdgeAt(j)) + canBeInPlace = false; } } std::map formatFrequency; - std::vector supportedLayouts = {LayoutType::ncsp, LayoutType::nspc, LayoutType::nCsp8c, LayoutType::nCsp16c}; + std::vector supportedLayouts = {LayoutType::ncsp, + LayoutType::nspc, + LayoutType::nCsp8c, + LayoutType::nCsp16c}; for (size_t i = 0; i < getParentEdges().size(); i++) { auto parentEdge = getParentEdgeAt(i); auto parent = parentEdge->getParent(); @@ -218,11 +229,11 @@ void Concat::selectOptimalPrimitiveDescriptor() { if (parent_pdesc == nullptr) continue; - const auto &parent_config = parent_pdesc->getConfig(); + const auto& parent_config = parent_pdesc->getConfig(); int outputIndex = parentEdge->getInputNum(); if (outputIndex < 0 || outputIndex >= static_cast(parent_config.outConfs.size())) OPENVINO_THROW("Cannot find index of output node"); - const auto &port_desc = parent_config.outConfs[outputIndex].getMemDesc(); + const auto& port_desc = parent_config.outConfs[outputIndex].getMemDesc(); for (auto& item : supportedLayouts) { if (port_desc->hasLayoutType(item)) { formatFrequency[item] += 1; @@ -232,15 +243,15 @@ void Concat::selectOptimalPrimitiveDescriptor() { for (size_t i = 0; i < getChildEdges().size(); i++) { auto childEdge = getChildEdgeAt(i); auto child = childEdge->getChild(); - const auto *prim_desc = child->getSelectedPrimitiveDescriptor(); + const auto* prim_desc = child->getSelectedPrimitiveDescriptor(); if (prim_desc == nullptr) continue; - const auto &config = prim_desc->getConfig(); + const auto& config = prim_desc->getConfig(); int inputIndex = childEdge->getOutputNum(); if (inputIndex < 0 || inputIndex >= static_cast(config.inConfs.size())) OPENVINO_THROW("Cannot find index of output node"); - const auto &port_desc = 
config.inConfs[inputIndex].getMemDesc(); + const auto& port_desc = config.inConfs[inputIndex].getMemDesc(); for (auto& item : supportedLayouts) { if (port_desc->hasLayoutType(item)) { formatFrequency[item] += 1; @@ -249,9 +260,9 @@ void Concat::selectOptimalPrimitiveDescriptor() { } size_t maxCount = 0; - const auto &outDims = getOutputShapeAtPort(0).getDims(); + const auto& outDims = getOutputShapeAtPort(0).getDims(); LayoutType convertTo = LayoutType::ncsp; - for (auto &it : formatFrequency) { + for (auto& it : formatFrequency) { if (it.second > maxCount) { maxCount = it.second; convertTo = it.first; @@ -264,7 +275,7 @@ void Concat::selectOptimalPrimitiveDescriptor() { } } - for (auto& item : { std::make_pair(8lu, LayoutType::nCsp8c), std::make_pair(16lu, LayoutType::nCsp16c) }) { + for (auto& item : {std::make_pair(8lu, LayoutType::nCsp8c), std::make_pair(16lu, LayoutType::nCsp16c)}) { if (convertTo == item.second) { if (outDims[channelAxis] == Shape::UNDEFINED_DIM || outDims[1] % item.first != 0) { convertTo = LayoutType::ncsp; @@ -282,7 +293,8 @@ void Concat::selectOptimalPrimitiveDescriptor() { for (size_t i = 0; i < supportedPrimitiveDescriptors.size(); ++i) { if (supportedPrimitiveDescriptors[i].getConfig().outConfs[0].getMemDesc()->hasLayoutType(convertTo)) { - if (IMPLICATION(supportedPrimitiveDescriptors[i].getImplementationType() == impl_desc_type::unknown, canBeInPlace)) { + if (IMPLICATION(supportedPrimitiveDescriptors[i].getImplementationType() == impl_desc_type::unknown, + canBeInPlace)) { canSelectPrimitive.push_back(i); } } @@ -444,24 +456,26 @@ void Concat::initOptimalPrimitiveDescriptor() { if (selected_pd == nullptr) OPENVINO_THROW("Preferable primitive descriptor is not set."); - if (!isInPlace()) { - Node::initOptimalPrimitiveDescriptor(); + if (!isInPlace()) { + Node::initOptimalPrimitiveDescriptor(); auto config = selected_pd->getConfig(); if (!isConfigDefined(config)) { for (size_t i = 0; i < config.inConfs.size(); i++) { // Concat doesn't 
support different precision on inputs - config.inConfs[i].setMemDesc(getConsistentInputDesc(config, i)->getMemDesc()->cloneWithNewPrecision(inputPrecision)); + config.inConfs[i].setMemDesc( + getConsistentInputDesc(config, i)->getMemDesc()->cloneWithNewPrecision(inputPrecision)); } for (size_t i = 0; i < config.outConfs.size(); i++) { - config.outConfs[i].setMemDesc(getConsistentOutputDesc(config, i)->getMemDesc()->cloneWithNewPrecision(outputPrecision)); + config.outConfs[i].setMemDesc( + getConsistentOutputDesc(config, i)->getMemDesc()->cloneWithNewPrecision(outputPrecision)); } initDescriptor(config); } } - //block layout may have axis greater than rank, disable ref_concat + // block layout may have axis greater than rank, disable ref_concat auto primDesc = getSelectedPrimitiveDescriptor(); auto memDesc = primDesc->getConfig().outConfs[0].getMemDesc()->as(); auto rank = memDesc->getShape().getRank(); @@ -474,7 +488,9 @@ void Concat::initOptimalPrimitiveDescriptor() { srcPtrs.resize(getParentEdges().size()); } // check if selected Tensor descriptor has nspc layout and concat axis is C - canOptimizeNspc = axis == channelAxis && getSelectedPrimitiveDescriptor()->getConfig().outConfs.front().getMemDesc()->hasLayoutType(LayoutType::nspc); + canOptimizeNspc = + axis == channelAxis && + getSelectedPrimitiveDescriptor()->getConfig().outConfs.front().getMemDesc()->hasLayoutType(LayoutType::nspc); } void Concat::execute(dnnl::stream strm) { @@ -497,7 +513,7 @@ void Concat::execute(dnnl::stream strm) { } else { const auto& dst_memory = getChildEdgeAt(0)->getMemory(); const size_t num_src = getParentEdges().size(); - std::unordered_map mem_ags {{DNNL_ARG_DST, dst_memory.getPrimitive()}}; + std::unordered_map mem_ags{{DNNL_ARG_DST, dst_memory.getPrimitive()}}; size_t nonZeroInShapes = 0; for (size_t i = 0; i < num_src; i++) { const auto& srcMem = getParentEdgeAt(i)->getMemory(); @@ -580,7 +596,7 @@ void Concat::execRef() { } if (!hasOuterLoop) { - if (nelemTotal < 64*1024 || 
parallel_get_max_threads() == 1) { + if (nelemTotal < 64 * 1024 || parallel_get_max_threads() == 1) { for (size_t a = 0; a < srcPtrs.size(); ++a) { const auto inData = srcPtrs[a]; auto outputData = &dstPtr[dstOffset[a]]; @@ -612,63 +628,65 @@ void Concat::execRef() { physDims[i] = outputShape[i]; } const auto L1Size = dnnl::utils::get_cache_size(1, true); - UNUSED(L1Size); // for Windows - parallel_for6d(physDims[0], physDims[1], physDims[2], physDims[3], physDims[4], numSrc, - [&](size_t n0, size_t n1, size_t n2, size_t n3, size_t n4, size_t a) { - // check if zero memory - if (srcPtrs[a] == nullptr) return; - - size_t inOff = inputStrides[a][0] * n0 + inputStrides[a][1] * n1 + inputStrides[a][2] * n2 - + inputStrides[a][3] * n3 + inputStrides[a][4] * n4; - size_t outOff = outputStrides[0] * n0 + outputStrides[1] * n1 + outputStrides[2] * n2 - + outputStrides[3] * n3 + outputStrides[4] * n4; - const uint8_t *i = &srcPtrs[a][inOff]; - uint8_t *o = &dstPtr[dstOffset[a] + outOff]; + UNUSED(L1Size); // for Windows + parallel_for6d(physDims[0], + physDims[1], + physDims[2], + physDims[3], + physDims[4], + numSrc, + [&](size_t n0, size_t n1, size_t n2, size_t n3, size_t n4, size_t a) { + // check if zero memory + if (srcPtrs[a] == nullptr) + return; + + size_t inOff = inputStrides[a][0] * n0 + inputStrides[a][1] * n1 + inputStrides[a][2] * n2 + + inputStrides[a][3] * n3 + inputStrides[a][4] * n4; + size_t outOff = outputStrides[0] * n0 + outputStrides[1] * n1 + outputStrides[2] * n2 + + outputStrides[3] * n3 + outputStrides[4] * n4; + const uint8_t* i = &srcPtrs[a][inOff]; + uint8_t* o = &dstPtr[dstOffset[a] + outOff]; #if defined(__GNUC__) - // Heuristic: - // memcpy works generally faster for data sizes not - // exceeding L1 cache. 
- if (nelemToCopy[a] > L1Size) { - // The code below performs data copying: o[e] = i[e] - // and uses a workaround to make GNU compilers optimize it - uint8_t *ptro = o; - const uint8_t *ptri = i; - // head part: bytes before 4 byte-align's address - const size_t headPart = sizeof(uint32_t) - - reinterpret_cast(ptro) - % sizeof(uint32_t); - - // main part: bytes in 4 byte-align - const size_t mainPart - = (nelemToCopy[a] - headPart) / sizeof(uint32_t); - // tail part: bytes after 4 byte-align - const size_t tailPart - = (nelemToCopy[a]) - headPart - - (mainPart * sizeof(uint32_t)); - // copy head part - for (size_t e = 0; e < headPart; ++e) { - *ptro = *ptri; - ++ptro; - ++ptri; - } - // copy main part - std::memcpy(ptro, ptri, mainPart * sizeof(uint32_t)); - ptro += mainPart * sizeof(uint32_t); - ptri += mainPart * sizeof(uint32_t); - // copy tail part - for (size_t e = 0; e < tailPart; ++e) { - *ptro = *ptri; - ++ptro; - ++ptri; - } - } else { - std::memcpy(o, i, nelemToCopy[a]); - } + // Heuristic: + // memcpy works generally faster for data sizes not + // exceeding L1 cache. 
+ if (nelemToCopy[a] > L1Size) { + // The code below performs data copying: o[e] = i[e] + // and uses a workaround to make GNU compilers optimize it + uint8_t* ptro = o; + const uint8_t* ptri = i; + // head part: bytes before 4 byte-align's address + const size_t headPart = + sizeof(uint32_t) - reinterpret_cast(ptro) % sizeof(uint32_t); + + // main part: bytes in 4 byte-align + const size_t mainPart = (nelemToCopy[a] - headPart) / sizeof(uint32_t); + // tail part: bytes after 4 byte-align + const size_t tailPart = (nelemToCopy[a]) - headPart - (mainPart * sizeof(uint32_t)); + // copy head part + for (size_t e = 0; e < headPart; ++e) { + *ptro = *ptri; + ++ptro; + ++ptri; + } + // copy main part + std::memcpy(ptro, ptri, mainPart * sizeof(uint32_t)); + ptro += mainPart * sizeof(uint32_t); + ptri += mainPart * sizeof(uint32_t); + // copy tail part + for (size_t e = 0; e < tailPart; ++e) { + *ptro = *ptri; + ++ptro; + ++ptri; + } + } else { + std::memcpy(o, i, nelemToCopy[a]); + } #else std::memcpy(o, i, nelemToCopy[a]); #endif - }); + }); } } @@ -691,8 +709,10 @@ void Concat::resolveInPlaceEdges(Edge::LOOK look) { " can't use inPlace memory with concatenation on dynamic dimension"); auto edges = getChildEdgesAtPort(inplaceOutIndx); - auto itr = std::find_if(edges.begin(), edges.end(), [](const EdgePtr& edge) { return edge->getStatus() == Edge::Status::Allocated; }); - OPENVINO_ASSERT(itr != edges.end(), " Could not find allocated child edge for concat node: " , getName()); + auto itr = std::find_if(edges.begin(), edges.end(), [](const EdgePtr& edge) { + return edge->getStatus() == Edge::Status::Allocated; + }); + OPENVINO_ASSERT(itr != edges.end(), " Could not find allocated child edge for concat node: ", getName()); auto baseMemBlock = (*itr)->getMemory().getMemoryBlock(); OPENVINO_ASSERT(baseMemBlock != nullptr, " NULL base memory block in concat node: ", getName()); @@ -726,6 +746,6 @@ void Concat::resolveInPlaceEdges(Edge::LOOK look) { } } -} // namespace node -} 
// namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/concat.h b/src/plugins/intel_cpu/src/nodes/concat.h index 9ed331bee4f16d..8b75e3839a372d 100644 --- a/src/plugins/intel_cpu/src/nodes/concat.h +++ b/src/plugins/intel_cpu/src/nodes/concat.h @@ -4,8 +4,8 @@ #pragma once -#include "node.h" #include "graph_context.h" +#include "node.h" namespace ov { namespace intel_cpu { @@ -22,7 +22,9 @@ class Concat : public Node { void selectOptimalPrimitiveDescriptor() override; bool created() const override; void execute(dnnl::stream strm) override; - void executeDynamicImpl(dnnl::stream strm) override { execute(strm); } + void executeDynamicImpl(dnnl::stream strm) override { + execute(strm); + } void resolveInPlaceEdges(Edge::LOOK look) override; ov::element::Type getRuntimePrecision() const override; @@ -42,9 +44,9 @@ class Concat : public Node { void execNspcSpecCase(); void exec1DCase(); std::vector inputStrides; - std::vector nelemToCopy; // byte moved in each iter + std::vector nelemToCopy; // byte moved in each iter size_t nelemTotal = 0; - std::vector dstOffset; // dst offset for each input + std::vector dstOffset; // dst offset for each input std::vector srcPtrs; bool hasOuterLoop = false; ov::element::Type inputPrecision = ov::element::f32; @@ -54,6 +56,6 @@ class Concat : public Node { dnnl::primitive prim; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/conv.cpp b/src/plugins/intel_cpu/src/nodes/conv.cpp index 7cf7698e989343..4cb2dc9058551f 100644 --- a/src/plugins/intel_cpu/src/nodes/conv.cpp +++ b/src/plugins/intel_cpu/src/nodes/conv.cpp @@ -4,8 +4,11 @@ #include "conv.h" -#include "openvino/op/convolution.hpp" -#include "openvino/op/group_conv.hpp" +#include +#include +#include +#include + #include "common/c_types_map.hpp" #include 
"common/cpu_convert.h" #include "common/primitive_desc.hpp" @@ -27,17 +30,14 @@ #include "oneapi/dnnl/dnnl_common.hpp" #include "oneapi/dnnl/dnnl_types.h" #include "onednn/dnnl.h" +#include "openvino/op/convolution.hpp" +#include "openvino/op/group_conv.hpp" #include "pooling.h" #include "reorder.h" #include "utils/cpu_utils.hpp" #include "utils/debug_capabilities.h" #include "utils/general_utils.h" -#include -#include -#include -#include - using namespace dnnl; namespace ov { @@ -88,7 +88,7 @@ size_t ConvKey::hash() const { return seed; } -bool ConvKey::operator==(const ConvKey &rhs) const { +bool ConvKey::operator==(const ConvKey& rhs) const { bool retVal = true; if (inp0 != rhs.inp0) { retVal = retVal && inp0 && rhs.inp0 && inp0->getDnnlDesc() == rhs.inp0->getDnnlDesc(); @@ -112,11 +112,11 @@ bool ConvKey::operator==(const ConvKey &rhs) const { return retVal; } -} // namespace +} // namespace class Convolution::FusedSubgraph { public: - FusedSubgraph(const std::vector &opList, const Convolution &conv, const GraphContext::CPtr context) { + FusedSubgraph(const std::vector& opList, const Convolution& conv, const GraphContext::CPtr context) { _graph = std::unique_ptr(new Graph()); std::unordered_set nodesSet; @@ -130,16 +130,16 @@ class Convolution::FusedSubgraph { nodesSet.insert(child); }; - //Make inputs - const auto &inpMemDesc1 = conv.getBaseMemDescAtOutputPort(0); + // Make inputs + const auto& inpMemDesc1 = conv.getBaseMemDescAtOutputPort(0); auto inp0 = std::make_shared(inpMemDesc1, "inp0", "Parameter", context); inputs.push_back(inp0); const size_t sumPortNum = conv.getParentEdges().size() - 1; - const auto &inpMemDesc2 = conv.getBaseMemDescAtInputPort(sumPortNum); + const auto& inpMemDesc2 = conv.getBaseMemDescAtInputPort(sumPortNum); auto inp1 = std::make_shared(inpMemDesc2, "inp1", "Parameter", context); inputs.push_back(inp1); - auto itr = std::find_if(opList.begin(), opList.end(), [](const NodePtr &node) { + auto itr = std::find_if(opList.begin(), 
opList.end(), [](const NodePtr& node) { if (auto eltwise = std::dynamic_pointer_cast(node)) { return eltwise->isSpecialConvolutionAddFusing(); } @@ -153,7 +153,7 @@ class Convolution::FusedSubgraph { addEdge(inp0, sumNode, 0, 0); addEdge(inp1, sumNode, 0, 1); - //Replicate the rest of the subgraph + // Replicate the rest of the subgraph auto parentItr = itr; while (++itr != opList.end()) { auto parentNode = *parentItr; @@ -173,8 +173,8 @@ class Convolution::FusedSubgraph { } } - //Make output - const auto &outMemDesc = conv.getBaseMemDescAtOutputPort(0); + // Make output + const auto& outMemDesc = conv.getBaseMemDescAtOutputPort(0); auto out = std::make_shared(outMemDesc, "out", "Result", context); addEdge(*parentItr, out, 0, 0); outputs.push_back(out); @@ -240,9 +240,20 @@ bool Convolution::isSupportedOperation(const std::shared_ptr& op } Convolution::Convolution(const std::shared_ptr& op, const GraphContext::CPtr context) - : Node(op, context, NgraphShapeInferFactory(op)), withBiases(false), withSum(false), withDWConv(false), - isGrouped(false), dw_conv_oc(0), dw_conv_ih(0), dw_conv_iw(0), dw_conv_in_dt(memory::data_type::undef), - groupNum(1lu), IC(1), groupIC(1), groupOC(1), eltwisePrecision(ov::element::f32) { + : Node(op, context, NgraphShapeInferFactory(op)), + withBiases(false), + withSum(false), + withDWConv(false), + isGrouped(false), + dw_conv_oc(0), + dw_conv_ih(0), + dw_conv_iw(0), + dw_conv_in_dt(memory::data_type::undef), + groupNum(1lu), + IC(1), + groupIC(1), + groupOC(1), + eltwisePrecision(ov::element::f32) { std::string errorMessage; if (!isSupportedOperation(op, errorMessage)) { OPENVINO_THROW_NOT_IMPLEMENTED(errorMessage); @@ -296,12 +307,12 @@ Convolution::Convolution(const std::shared_ptr& op, const GraphContext } paddingL = groupConvolutionOp->get_pads_begin(); paddingR = groupConvolutionOp->get_pads_end(); - autoPadding = one_of(groupConvolutionOp->get_auto_pad(), ov::op::PadType::SAME_UPPER, ov::op::PadType::SAME_LOWER); + autoPadding = + 
one_of(groupConvolutionOp->get_auto_pad(), ov::op::PadType::SAME_UPPER, ov::op::PadType::SAME_LOWER); } // Only apply this heuristic logic on FP32 IR. IC=1 ,OC=1 would disable brgconv on avx2. const bool isAvx2FP32 = !dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core) && - dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2) && - !context->isGraphQuantized(); + dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2) && !context->isGraphQuantized(); useJitPlanar = ((IC == 1 && groupOC * groupNum == 1) && isAvx2FP32); } @@ -315,7 +326,8 @@ bool Convolution::canBeExecutedInInt8() const { if (!legacyWeightsZeroPoints.empty()) weightsDataType = memory::data_type::s8; - return one_of(inputDataType, memory::data_type::u8, memory::data_type::s8) && weightsDataType == memory::data_type::s8; + return one_of(inputDataType, memory::data_type::u8, memory::data_type::s8) && + weightsDataType == memory::data_type::s8; } ov::element::Type Convolution::fusedEltwisePrecision(const NodePtr& fusingNode) const { @@ -338,60 +350,63 @@ ov::element::Type Convolution::fusedEltwisePrecision(const NodePtr& fusingNode) const std::vector& Convolution::getDefaultImplPriority() { static const std::vector priorities = { - impl_desc_type::unknown, - impl_desc_type::dw_acl, - impl_desc_type::winograd_acl, - impl_desc_type::gemm_acl, - impl_desc_type::acl, - impl_desc_type::brgconv_avx512_amx_1x1, - impl_desc_type::brgconv_avx512_amx, - impl_desc_type::jit_avx512_amx_dw, - impl_desc_type::jit_avx512_amx_1x1, - impl_desc_type::jit_avx512_amx, - impl_desc_type::brgconv_avx512_1x1, - impl_desc_type::brgconv_avx512, - impl_desc_type::jit_avx512_dw, - impl_desc_type::jit_avx512_1x1, - impl_desc_type::jit_avx512, - impl_desc_type::brgconv_avx2_1x1, - impl_desc_type::brgconv_avx2, - impl_desc_type::jit_uni_dw, - impl_desc_type::jit_uni_1x1, - impl_desc_type::jit_uni, - impl_desc_type::jit_avx2_dw, - impl_desc_type::jit_avx2_1x1, - impl_desc_type::jit_avx2, - impl_desc_type::jit_avx_dw, 
- impl_desc_type::jit_avx_1x1, - impl_desc_type::jit_avx, - impl_desc_type::jit_sse42_dw, - impl_desc_type::jit_sse42_1x1, - impl_desc_type::jit_sse42, - impl_desc_type::gemm_any, - impl_desc_type::gemm_blas, - impl_desc_type::gemm_avx512, - impl_desc_type::gemm_avx2, - impl_desc_type::gemm_avx, - impl_desc_type::gemm_sse42, - impl_desc_type::jit_gemm, - impl_desc_type::ref_any, - impl_desc_type::ref, - }; - if (isBrgConvAvailable()) - return priorities; - - static const std::vector priorities_wo_brgemm = [&] { - std::vectorresult; - std::copy_if(priorities.begin(), priorities.end(), std::back_inserter(result), - [](impl_desc_type type) { return !(type & impl_desc_type::brgconv); }); - return result;}(); - return priorities_wo_brgemm; + impl_desc_type::unknown, + impl_desc_type::dw_acl, + impl_desc_type::winograd_acl, + impl_desc_type::gemm_acl, + impl_desc_type::acl, + impl_desc_type::brgconv_avx512_dw, + impl_desc_type::brgconv_avx512_amx_1x1, + impl_desc_type::brgconv_avx512_amx, + impl_desc_type::jit_avx512_amx_dw, + impl_desc_type::jit_avx512_amx_1x1, + impl_desc_type::jit_avx512_amx, + impl_desc_type::brgconv_avx512_1x1, + impl_desc_type::brgconv_avx512, + impl_desc_type::jit_avx512_dw, + impl_desc_type::jit_avx512_1x1, + impl_desc_type::jit_avx512, + impl_desc_type::brgconv_avx2_dw, + impl_desc_type::brgconv_avx2_1x1, + impl_desc_type::brgconv_avx2, + impl_desc_type::jit_uni_dw, + impl_desc_type::jit_uni_1x1, + impl_desc_type::jit_uni, + impl_desc_type::jit_avx2_dw, + impl_desc_type::jit_avx2_1x1, + impl_desc_type::jit_avx2, + impl_desc_type::jit_avx_dw, + impl_desc_type::jit_avx_1x1, + impl_desc_type::jit_avx, + impl_desc_type::jit_sse42_dw, + impl_desc_type::jit_sse42_1x1, + impl_desc_type::jit_sse42, + impl_desc_type::gemm_any, + impl_desc_type::gemm_blas, + impl_desc_type::gemm_avx512, + impl_desc_type::gemm_avx2, + impl_desc_type::gemm_avx, + impl_desc_type::gemm_sse42, + impl_desc_type::jit_gemm, + impl_desc_type::ref_any, + impl_desc_type::ref, + }; + 
if (isBrgConvAvailable()) + return priorities; + + static const std::vector priorities_wo_brgemm = [&] { + std::vector result; + std::copy_if(priorities.begin(), priorities.end(), std::back_inserter(result), [](impl_desc_type type) { + return !(type & impl_desc_type::brgconv); + }); + return result; + }(); + return priorities_wo_brgemm; } const bool Convolution::isBrgConvAvailable() { - //When avx2 brgconv heuristic case, disable brgconv to WA the regression. - const bool isBrgConvAvailable = dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2) && - !useJitPlanar; + // When avx2 brgconv heuristic case, disable brgconv to WA the regression. + const bool isBrgConvAvailable = dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2) && !useJitPlanar; return isBrgConvAvailable; } @@ -411,7 +426,7 @@ void Convolution::getSupportedDescriptors() { } if (fusedWith[i]->getAlgorithm() == Algorithm::EltwiseAdd) { - auto* eltwiseNode = dynamic_cast(fusedWith[i].get()); + auto* eltwiseNode = dynamic_cast(fusedWith[i].get()); if (eltwiseNode && eltwiseNode->isSpecialConvolutionAddFusing()) { expectedInputEdgesNum++; } @@ -425,17 +440,19 @@ void Convolution::getSupportedDescriptors() { outputDataType = DnnlExtensionUtils::ElementTypeToDataType(getOriginalOutputPrecisionAtPort(0)); eltwisePrecision = DnnlExtensionUtils::DataTypeToElementType(outputDataType); if (!fusedWith.empty()) { - outputDataType = DnnlExtensionUtils::ElementTypeToDataType(fusedWith[fusedWith.size() - 1]->getOriginalOutputPrecisionAtPort(0)); + outputDataType = DnnlExtensionUtils::ElementTypeToDataType( + fusedWith[fusedWith.size() - 1]->getOriginalOutputPrecisionAtPort(0)); eltwisePrecision = DnnlExtensionUtils::DataTypeToElementType(outputDataType); } // We need to make sure that convolution output and second input of fused Eltwise operation - // have equal precision sizes since they use the same physical memory. In case precisions are different we upscale to FP32. 
+ // have equal precision sizes since they use the same physical memory. In case precisions are different we upscale + // to FP32. if (outputDataType != memory::data_type::f32 && outputDataType != memory::data_type::bf16 && outputDataType != memory::data_type::f16 && withSum) { for (size_t i = 0; i < fusedWith.size(); i++) { if (fusedWith[i]->getAlgorithm() == Algorithm::EltwiseAdd) { - auto* eltwiseNode = dynamic_cast(fusedWith[i].get()); + auto* eltwiseNode = dynamic_cast(fusedWith[i].get()); if (eltwiseNode && eltwiseNode->isSpecialConvolutionAddFusing()) { eltwisePrecision = fusedEltwisePrecision(fusedWith[i]); if (DnnlExtensionUtils::DataTypeToElementType(outputDataType).size() != eltwisePrecision.size()) { @@ -466,7 +483,7 @@ void Convolution::getSupportedDescriptors() { } for (size_t i = 0; i < fusedWith.size(); i++) { - auto *convolutionNode = dynamic_cast(fusedWith[i].get()); + auto* convolutionNode = dynamic_cast(fusedWith[i].get()); if (convolutionNode) { auto& inActivationDims = convolutionNode->inputShapes[0].getStaticDims(); dw_conv_ih = inActivationDims[convolutionNode->inputShapes[0].getRank() - 2]; @@ -475,7 +492,7 @@ void Convolution::getSupportedDescriptors() { auto& outDims = convolutionNode->outputShapes[0].getStaticDims(); dw_conv_oc = outDims[1]; - const auto &dwWeightsDims = convolutionNode->inputShapes[1].getStaticDims(); + const auto& dwWeightsDims = convolutionNode->inputShapes[1].getStaticDims(); dw_conv_kernel.push_back(dwWeightsDims[dwWeightsDims.size() - 1]); dw_conv_kernel.push_back(dwWeightsDims[dwWeightsDims.size() - 2]); dw_conv_strides = convolutionNode->getStride(); @@ -484,7 +501,8 @@ void Convolution::getSupportedDescriptors() { if (i == 0) { dw_conv_in_dt = DnnlExtensionUtils::ElementTypeToDataType(getOriginalOutputPrecisionAtPort(0)); } else { - dw_conv_in_dt = DnnlExtensionUtils::ElementTypeToDataType(fusedWith[i - 1]->getOriginalOutputPrecisionAtPort(0)); + dw_conv_in_dt = DnnlExtensionUtils::ElementTypeToDataType( + 
fusedWith[i - 1]->getOriginalOutputPrecisionAtPort(0)); } } else { dw_conv_in_dt = memory::data_type::f32; @@ -496,7 +514,7 @@ void Convolution::getSupportedDescriptors() { int src = getInputShapeAtPort(0).getStaticDims()[2 + j]; int dst = getOutputShapeAtPort(0).getStaticDims()[2 + j]; - krn = (krn - 1)*(dilation[j] + 1) + 1; + krn = (krn - 1) * (dilation[j] + 1) + 1; int calc_dst = (src - krn + paddingL[j]) / stride[j] + 1; paddingR[j] = (dst - calc_dst) * stride[j]; } @@ -504,10 +522,14 @@ void Convolution::getSupportedDescriptors() { } MemoryDescPtr in_candidate, out_candidate; - memory::format_tag nspc = ndims == 3 ? memory::format_tag::nwc : (ndims == 4 ? memory::format_tag::nhwc : memory::format_tag::ndhwc); - memory::format_tag ncsp = ndims == 3 ? memory::format_tag::ncw : (ndims == 4 ? memory::format_tag::nchw : memory::format_tag::ncdhw); - memory::format_tag nCsp8c = ndims == 3 ? memory::format_tag::nCw8c : (ndims == 4 ? memory::format_tag::nChw8c : memory::format_tag::nCdhw8c); - memory::format_tag nCsp16c = ndims == 3 ? memory::format_tag::nCw16c : (ndims == 4 ? memory::format_tag::nChw16c : memory::format_tag::nCdhw16c); + memory::format_tag nspc = + ndims == 3 ? memory::format_tag::nwc : (ndims == 4 ? memory::format_tag::nhwc : memory::format_tag::ndhwc); + memory::format_tag ncsp = + ndims == 3 ? memory::format_tag::ncw : (ndims == 4 ? memory::format_tag::nchw : memory::format_tag::ncdhw); + memory::format_tag nCsp8c = ndims == 3 ? memory::format_tag::nCw8c + : (ndims == 4 ? memory::format_tag::nChw8c : memory::format_tag::nCdhw8c); + memory::format_tag nCsp16c = ndims == 3 ? memory::format_tag::nCw16c + : (ndims == 4 ? 
memory::format_tag::nChw16c : memory::format_tag::nCdhw16c); if (canBeExecutedInInt8()) { DEBUG_LOG(getName(), "Creating I8 descriptor"); @@ -522,7 +544,7 @@ void Convolution::getSupportedDescriptors() { in_candidate = std::make_shared(getInputShapeAtPort(0), inputDataType, nspc); out_candidate = std::make_shared(getOutputShapeAtPort(0), outputDataType, nspc); - createDescriptor({ in_candidate }, { out_candidate }); + createDescriptor({in_candidate}, {out_candidate}); return; } @@ -547,7 +569,7 @@ void Convolution::getSupportedDescriptors() { eltwisePrecision = ov::element::f32; for (size_t i = 0; i < fusedWith.size(); i++) { if (fusedWith[i]->getAlgorithm() == Algorithm::EltwiseAdd) { - auto* eltwiseNode = dynamic_cast(fusedWith[i].get()); + auto* eltwiseNode = dynamic_cast(fusedWith[i].get()); if (eltwiseNode && eltwiseNode->isSpecialConvolutionAddFusing()) { eltwisePrecision = fusedEltwisePrecision(fusedWith[i]); // TODO(amalyshe): there might be situation when convolution can be executed in BF16, @@ -579,42 +601,44 @@ void Convolution::getSupportedDescriptors() { #if defined(OPENVINO_ARCH_X86_64) // nspc shows better performance only with brgconv implementation - bool nspcFirst = isBrgConvAvailable() && one_of(inputDataType, memory::data_type::f16, memory::data_type::bf16, memory::data_type::f32); + bool nspcFirst = isBrgConvAvailable() && + one_of(inputDataType, memory::data_type::f16, memory::data_type::bf16, memory::data_type::f32); bool nspcAdded = false; if (nspcFirst) { in_candidate = std::make_shared(inputShape, inputDataType, nspc); out_candidate = std::make_shared(outputShape, outputDataType, nspc); - createDescriptor({ in_candidate }, { out_candidate }); + createDescriptor({in_candidate}, {out_candidate}); nspcAdded = true; } if (IC == 1 && groupOC == 1) { in_candidate = std::make_shared(inputShape, inputDataType, ncsp); out_candidate = std::make_shared(outputShape, outputDataType, ncsp); - createDescriptor({ in_candidate }, { out_candidate }); + 
createDescriptor({in_candidate}, {out_candidate}); } else if (IC < 4) { in_candidate = std::make_shared(inputShape, inputDataType, ncsp); out_candidate = std::make_shared(outputShape, outputDataType, nCsp16c); - createDescriptor({ in_candidate }, { out_candidate }); + createDescriptor({in_candidate}, {out_candidate}); out_candidate = std::make_shared(outputShape, outputDataType, nCsp8c); - createDescriptor({ in_candidate }, { out_candidate }); + createDescriptor({in_candidate}, {out_candidate}); } else { in_candidate = std::make_shared(inputShape, inputDataType, nCsp16c); out_candidate = std::make_shared(outputShape, outputDataType, nCsp16c); - createDescriptor({ in_candidate }, { out_candidate }); + createDescriptor({in_candidate}, {out_candidate}); in_candidate = std::make_shared(inputShape, inputDataType, nCsp8c); out_candidate = std::make_shared(outputShape, outputDataType, nCsp8c); - createDescriptor({ in_candidate }, { out_candidate }); + createDescriptor({in_candidate}, {out_candidate}); } in_candidate = std::make_shared(inputShape, inputDataType, ncsp); out_candidate = std::make_shared(outputShape, outputDataType, ncsp); - createDescriptor({ in_candidate }, { out_candidate }); + createDescriptor({in_candidate}, {out_candidate}); - if (!nspcAdded && (inputDataType != memory::data_type::bf16 && inputDataType != memory::data_type::f16 && isNspcAvailable())) { + if (!nspcAdded && + (inputDataType != memory::data_type::bf16 && inputDataType != memory::data_type::f16 && isNspcAvailable())) { in_candidate = std::make_shared(inputShape, inputDataType, nspc); out_candidate = std::make_shared(outputShape, outputDataType, nspc); - createDescriptor({ in_candidate }, { out_candidate }); + createDescriptor({in_candidate}, {out_candidate}); } #else (void)ncsp; @@ -623,7 +647,7 @@ void Convolution::getSupportedDescriptors() { in_candidate = std::make_shared(inputShape, inputDataType, nspc); out_candidate = std::make_shared(outputShape, outputDataType, nspc); - 
createDescriptor({ in_candidate }, { out_candidate }); + createDescriptor({in_candidate}, {out_candidate}); #endif } @@ -634,9 +658,11 @@ void Convolution::setPostOps(dnnl::primitive_attr& attr, dnnl::post_ops ops; auto& args = convPostOpsArgs[useLegacyPostOps]; bool isINT8 = canBeExecutedInInt8(); - // Weight dims in NON-Group CONV: [OC, IC, KH, KW], perchannel weight scale applied on OC DIM, weiScaleMaskPerChannel = 1 << 0 - // Weight dims in Group CONV:[Group, OC, IC, KH, KW], perchannel weight scale applied on GROUP and OC DIM, weiScaleMaskPerChannel = ( 1 << 0 | 1<< 1) = 0x03 - DnnlPostOpsComposerLegacy dnnlpoc(getEngine(), attr, ops, args, dims, 1, isINT8, isGrouped ? 3 : 1 << 0, getDQScales(), withBiases); + // Weight dims in NON-Group CONV: [OC, IC, KH, KW], perchannel weight scale applied on OC DIM, + // weiScaleMaskPerChannel = 1 << 0 Weight dims in Group CONV:[Group, OC, IC, KH, KW], perchannel weight scale + // applied on GROUP and OC DIM, weiScaleMaskPerChannel = ( 1 << 0 | 1<< 1) = 0x03 + DnnlPostOpsComposerLegacy + dnnlpoc(getEngine(), attr, ops, args, dims, 1, isINT8, isGrouped ? 
3 : 1 << 0, getDQScales(), withBiases); DEBUG_LOG(getName(), " useLegacyPostOps=", useLegacyPostOps, " initWeights=", initWeights); @@ -679,14 +705,14 @@ void Convolution::setPostOps(dnnl::primitive_attr& attr, bool hasSubsequentSum = false; bool hasSubsequentFQ = false; for (size_t j = i + 1; j < fusedWith.size(); j++) { - auto &nextNode = fusedWith[j]; + auto& nextNode = fusedWith[j]; - auto *nextEltwiseNode = dynamic_cast(nextNode.get()); + auto* nextEltwiseNode = dynamic_cast(nextNode.get()); if (nextEltwiseNode && nextEltwiseNode->isSpecialConvolutionAddFusing()) { hasSubsequentSum = true; } - auto *nextQuantizeNode = dynamic_cast(nextNode.get()); + auto* nextQuantizeNode = dynamic_cast(nextNode.get()); if (nextQuantizeNode) { hasSubsequentFQ = true; } @@ -779,12 +805,16 @@ void Convolution::initSupportedPrimitiveDescriptors() { const std::vector dwWeightsDims{dw_conv_oc, 1, 1, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS]}; const std::vector dwBiasesDims{dw_conv_oc}; - const auto dwWeightsPrc = DnnlExtensionUtils::ElementTypeToDataType(dw_conv_in_dt == dnnl_u8 ? ov::element::i8 : ov::element::f32); - const auto dwWeightsDesc = std::make_shared(Shape(dwWeightsDims), dwWeightsPrc, memory::format_tag::Goihw8g); + const auto dwWeightsPrc = DnnlExtensionUtils::ElementTypeToDataType( + dw_conv_in_dt == dnnl_u8 ? 
ov::element::i8 : ov::element::f32); + const auto dwWeightsDesc = std::make_shared(Shape(dwWeightsDims), + dwWeightsPrc, + memory::format_tag::Goihw8g); inConfs.emplace_back(dwWeightsDesc); const auto dwBiasPrc = memory::data_type::f32; - const auto dwBiasDesc = std::make_shared(Shape(dwBiasesDims), dwBiasPrc, memory::format_tag::x); + const auto dwBiasDesc = + std::make_shared(Shape(dwBiasesDims), dwBiasPrc, memory::format_tag::x); inConfs.emplace_back(dwBiasDesc); } @@ -807,15 +837,25 @@ void Convolution::initSupportedPrimitiveDescriptors() { }; #ifdef CPU_DEBUG_CAPS { - if (!customImplPriorities.empty()) { - DEBUG_LOG("#", getName(), " customImplPriorities [", 0 , "/", customImplPriorities.size(), - "]: ", impl_type_to_string(customImplPriorities[0])); - } + if (!customImplPriorities.empty()) { + DEBUG_LOG("#", + getName(), + " customImplPriorities [", + 0, + "/", + customImplPriorities.size(), + "]: ", + impl_type_to_string(customImplPriorities[0])); + } } #endif for (size_t dIdx = 0; dIdx < descs.size(); dIdx++) { auto& desc = descs[dIdx]; - auto first_desc = dnnl::primitive_desc(DnnlExtensionUtils::clone_primitive_desc(desc.get())); + auto primitive_desc = desc.get(true); // true mean allow empty + if (primitive_desc == nullptr) { + continue; + } + auto first_desc = dnnl::primitive_desc(DnnlExtensionUtils::clone_primitive_desc(primitive_desc)); auto add_supported_desc = [&](dnnl::primitive_desc& desc) { addSupportedPrimitiveDescriptor(desc); @@ -823,16 +863,25 @@ void Convolution::initSupportedPrimitiveDescriptors() { }; const bool first_match = customImplPriorities.empty(); - DEBUG_LOG("#", getName(), - ", itpd.impl_info_str(): ", desc.impl_info_str(), - ", parsed imp_type: ", impl_type_to_string(parse_impl_name(desc.impl_info_str())), - ", first_match: ", first_match ? 
"true" : "false"); - DnnlExtensionUtils::for_each_implementation(desc, - first_match, - [&](impl_desc_type implType) { - return contains(getImplPriority(), implType); - }, - add_supported_desc); + DEBUG_LOG("#", + getName(), + ",descIndex:", + dIdx + 1, + "/", + descs.size(), + ", itpd.impl_info_str(): ", + desc.impl_info_str(), + ", parsed imp_type: ", + impl_type_to_string(parse_impl_name(desc.impl_info_str())), + ", first_match: ", + first_match ? "true" : "false"); + DnnlExtensionUtils::for_each_implementation( + desc, + first_match, + [&](impl_desc_type implType) { + return contains(getImplPriority(), implType); + }, + add_supported_desc); // fallback. if none of the primitive types is present in the priority list just add first implementation // @todo this fallback is not necessary if primitive priority list is filled correctly @@ -846,46 +895,48 @@ bool Convolution::created() const { } namespace { -dnnl::convolution_forward::primitive_desc -createDescriptorInternal(const dnnl::engine& engine, - const dnnl::memory::desc& inputDesc, - const dnnl::memory::desc& weightDesc, - const dnnl::memory::desc& biasDesc, - const dnnl::memory::desc& outputDesc, - bool withBiases, - const std::vector& stride, - const std::vector& dilation, - const std::vector& paddingL, - const std::vector& paddingR, - dnnl::algorithm alg, - const dnnl::primitive_attr& attr) { +dnnl::convolution_forward::primitive_desc createDescriptorInternal(const dnnl::engine& engine, + const dnnl::memory::desc& inputDesc, + const dnnl::memory::desc& weightDesc, + const dnnl::memory::desc& biasDesc, + const dnnl::memory::desc& outputDesc, + bool withBiases, + const std::vector& stride, + const std::vector& dilation, + const std::vector& paddingL, + const std::vector& paddingR, + dnnl::algorithm alg, + const dnnl::primitive_attr& attr) { if (withBiases) { - return dnnl::convolution_forward::primitive_desc( - engine, - prop_kind::forward_inference, - alg, - inputDesc, weightDesc, biasDesc, outputDesc, - 
dnnl::memory::dims(stride.begin(), stride.end()), - dnnl::memory::dims(dilation.begin(), dilation.end()), - dnnl::memory::dims(paddingL.begin(), paddingL.end()), - dnnl::memory::dims(paddingR.begin(), paddingR.end()), - attr, - true); // allow_empty + return dnnl::convolution_forward::primitive_desc(engine, + prop_kind::forward_inference, + alg, + inputDesc, + weightDesc, + biasDesc, + outputDesc, + dnnl::memory::dims(stride.begin(), stride.end()), + dnnl::memory::dims(dilation.begin(), dilation.end()), + dnnl::memory::dims(paddingL.begin(), paddingL.end()), + dnnl::memory::dims(paddingR.begin(), paddingR.end()), + attr, + true); // allow_empty } else { - return dnnl::convolution_forward::primitive_desc( - engine, - prop_kind::forward_inference, - alg, - inputDesc, weightDesc, outputDesc, - dnnl::memory::dims(stride.begin(), stride.end()), - dnnl::memory::dims(dilation.begin(), dilation.end()), - dnnl::memory::dims(paddingL.begin(), paddingL.end()), - dnnl::memory::dims(paddingR.begin(), paddingR.end()), - attr, - true); // allow_empty + return dnnl::convolution_forward::primitive_desc(engine, + prop_kind::forward_inference, + alg, + inputDesc, + weightDesc, + outputDesc, + dnnl::memory::dims(stride.begin(), stride.end()), + dnnl::memory::dims(dilation.begin(), dilation.end()), + dnnl::memory::dims(paddingL.begin(), paddingL.end()), + dnnl::memory::dims(paddingR.begin(), paddingR.end()), + attr, + true); // allow_empty } } -} // namespace +} // namespace static memory::data_type deriveWeightDataType(memory::data_type src_dt) { memory::data_type wdt = src_dt; @@ -910,7 +961,7 @@ void Convolution::createDescriptor(const std::vector& inputDesc, if (outputDesc[0]->isDefined()) { definedOutMemDesc = MemoryDescUtils::convertToDnnlMemoryDesc(outputDesc[0]); } else { - std::vector shapes = { definedInpMemDesc->getShape(), Shape(weightDims) }; + std::vector shapes = {definedInpMemDesc->getShape(), Shape(weightDims)}; auto outDims = shapeInferGeneric(shapes); 
definedOutMemDesc = MemoryDescUtils::convertToDnnlMemoryDesc(outputDesc[0]->cloneWithNewDims(outDims.front())); } @@ -924,13 +975,14 @@ void Convolution::createDescriptor(const std::vector& inputDesc, dnnl::memory::desc biasDnnlDesc; if (withBiases) { - //oneDNN ARM Convolution primitive supports only identical in/out data types + // oneDNN ARM Convolution primitive supports only identical in/out data types #if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64) memory::data_type bdt = outDnnlDesc.get_data_type(); #else memory::data_type bdt = memory::data_type::f32; #endif - biasDnnlDesc = dnnl::memory::desc(DnnlExtensionUtils::convertToDnnlDims(expectedBiasDims), bdt, memory::format_tag::any); + biasDnnlDesc = + dnnl::memory::desc(DnnlExtensionUtils::convertToDnnlDims(expectedBiasDims), bdt, memory::format_tag::any); } std::vector algorithms; @@ -942,10 +994,18 @@ void Convolution::createDescriptor(const std::vector& inputDesc, for (const auto alg : algorithms) { for (const auto& attr : attrs) { const auto desc = createDescriptorInternal(getEngine(), - inDnnlDesc, weightDnnlDesc, biasDnnlDesc, outDnnlDesc, withBiases, - stride, dilation, paddingL, paddingR, alg, attr); - if (desc) - descs.emplace_back(desc); + inDnnlDesc, + weightDnnlDesc, + biasDnnlDesc, + outDnnlDesc, + withBiases, + stride, + dilation, + paddingL, + paddingR, + alg, + attr); + descs.emplace_back(desc); } } } @@ -978,7 +1038,8 @@ void Convolution::addLegacyZeroPoints(dnnl::primitive_attr& attr) { if (!legacyWeightsZeroPointsMemPtr) { DnnlBlockedMemoryDesc memoryDesc(ov::element::f32, {legacyWeightsZeroPoints.size()}); - legacyWeightsZeroPointsMemPtr = std::make_shared(getEngine(), memoryDesc, legacyWeightsZeroPoints.data()); + legacyWeightsZeroPointsMemPtr = + std::make_shared(getEngine(), memoryDesc, legacyWeightsZeroPoints.data()); } } @@ -988,7 +1049,8 @@ void Convolution::addLegacyZeroPoints(dnnl::primitive_attr& attr) { if (!legacyOutputCompensationMemPtr) { DnnlBlockedMemoryDesc 
memoryDesc(ov::element::i32, {legacyOutputCompensation.size()}); - legacyOutputCompensationMemPtr = std::make_shared(getEngine(), memoryDesc, legacyOutputCompensation.data()); + legacyOutputCompensationMemPtr = + std::make_shared(getEngine(), memoryDesc, legacyOutputCompensation.data()); } } } @@ -999,7 +1061,7 @@ static bool attrContainsPostOp(const dnnl::primitive_attr& attr, const dnnl::imp } // See the src/plugins/intel_cpu/src/docs/convPostOps.md for details -void Convolution::SetPostOpsAndZeroPoints(std::vector &attrs) { +void Convolution::SetPostOpsAndZeroPoints(std::vector& attrs) { attrs.resize(1); auto outputShape = outputStaticShape(); // attr[0] - Legacy post ops + Legacy zero points. @@ -1007,14 +1069,13 @@ void Convolution::SetPostOpsAndZeroPoints(std::vector &att setPostOps(attrs[0], outputShape, true); addLegacyZeroPoints(attrs[0]); - //dw-conv would be fused into conv only on AVX2 platform. no need attr[1]. Avoid extra useless attribute. + // dw-conv would be fused into conv only on AVX2 platform. no need attr[1]. Avoid extra useless attribute. if (attrContainsPostOp(attrs[0], dnnl::impl::primitive_kind::convolution)) { return; } // no matter if brgconv is available, 1 attribute is enough. Avoid duplicated attribute - if (inputZeroPointType == zpType::None && - !attrContainsPostOp(attrs[0], dnnl::impl::primitive_kind::depthwise) && + if (inputZeroPointType == zpType::None && !attrContainsPostOp(attrs[0], dnnl::impl::primitive_kind::depthwise) && !attrContainsPostOp(attrs[0], dnnl::impl::primitive_kind::quantization)) { return; } @@ -1029,10 +1090,11 @@ void Convolution::SetPostOpsAndZeroPoints(std::vector &att } // Try 2 attributes. attrs.resize(2); - if (inputZeroPointType == zpType::PerTensor && dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx)) { - //WR to ONEDNN limitation. attr[1] - legacy post ops + stock zero point. 
+ if (inputZeroPointType == zpType::PerTensor && + dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx)) { + // WR to ONEDNN limitation. attr[1] - legacy post ops + stock zero point. //@todo:Unify to use binary postops+stock zero point when limitation is fixed. - //For now, have to adapt to JIT_AMX kernel for performance. + // For now, have to adapt to JIT_AMX kernel for performance. DEBUG_LOG(getName(), ": set post ops, attr 1, useLegacyPostOps=true"); setPostOps(attrs[1], outputShape, true); } else { @@ -1043,7 +1105,7 @@ void Convolution::SetPostOpsAndZeroPoints(std::vector &att } void Convolution::initDescriptor(const NodeConfig& config) { - auto *selectedPD = getSelectedPrimitiveDescriptor(); + auto* selectedPD = getSelectedPrimitiveDescriptor(); if (!selectedPD) { return; @@ -1052,24 +1114,29 @@ void Convolution::initDescriptor(const NodeConfig& config) { // attr[0] for legacy post ops; // attr[1] is mostly for binaryPostops except when having per-tensor zp on AMX. const int descId = descIdx[selectedPrimitiveDescriptorIndex]; - int attrId = attrs.size() == 1 ? 0 : - descId % 2 == 0 ? 0 : 1; + int attrId = attrs.size() == 1 ? 0 : descId % 2 == 0 ? 0 : 1; preferLegacyPostOps = (attrId == 0 || (attrId == 1 && (inputZeroPointType == zpType::PerTensor) && - dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx))); - //attr[0] for legacy zero point. - //attr[1] for stock per-tensor zero point. + dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx))); + // attr[0] for legacy zero point. + // attr[1] for stock per-tensor zero point. 
preferLegacyZeroPoint = (attrId == 0); DEBUG_LOG(getName(), - " selectedPrimitiveDescriptorIndex: ", selectedPrimitiveDescriptorIndex, - " DescIdx: ", descId, - " Selected impl type: ", selectedPD->getImplementationType(), - " Desc impl type: ", parse_impl_name(descs[descId].impl_info_str()), - " preferLegacyPostOps: ", preferLegacyPostOps, - " preferLegacyZeroPoint: ", preferLegacyZeroPoint); - - auto updateNodeConfig = [&](const NodeConfig& cfg){ + " selectedPrimitiveDescriptorIndex: ", + selectedPrimitiveDescriptorIndex, + " DescIdx: ", + descId, + " Selected impl type: ", + selectedPD->getImplementationType(), + " Desc impl type: ", + parse_impl_name(descs[descId].impl_info_str()), + " preferLegacyPostOps: ", + preferLegacyPostOps, + " preferLegacyZeroPoint: ", + preferLegacyZeroPoint); + + auto updateNodeConfig = [&](const NodeConfig& cfg) { auto updatedConfig = cfg; for (size_t i = 0; i < descInputNumbers(); i++) { @@ -1092,7 +1159,7 @@ void Convolution::initDescriptor(const NodeConfig& config) { return updatedConfig; }; - if (!canBeExecutedInInt8()) { // strided blobs are suppoted only for FP32 convolutions + if (!canBeExecutedInInt8()) { // strided blobs are suppoted only for FP32 convolutions descs.clear(); createDescriptor({config.inConfs[0].getMemDesc()}, {config.outConfs[0].getMemDesc()}); @@ -1110,7 +1177,7 @@ void Convolution::initDescriptor(const NodeConfig& config) { selectedPD->setConfig(updatedConfig); } -std::shared_ptr Convolution::getSrcMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const { +std::shared_ptr Convolution::getSrcMemDesc(const dnnl::primitive_desc& prim_desc, size_t idx) const { if (idx == 1) { // report original plain layout for weight since it needs to be reordered dynamically at runtime return std::make_shared(getOriginalInputPrecisionAtPort(idx), @@ -1146,7 +1213,8 @@ ov::element::Type Convolution::getRuntimePrecision() const { for (size_t i = 0; i < std::min(getParentEdges().size(), inputsNumLimit); i++) { auto 
parentEdge = getParentEdgeAt(i); if (parentEdge && parentEdge->getStatus() == Edge::Status::Validated) { - inputPrecisions.emplace_back(DnnlExtensionUtils::DataTypeToElementType((parentEdge->getMemoryPtr()->getDataType()))); + inputPrecisions.emplace_back( + DnnlExtensionUtils::DataTypeToElementType((parentEdge->getMemoryPtr()->getDataType()))); } } @@ -1178,8 +1246,9 @@ bool Convolution::isNspcAvailable() const { return false; } } else { - // it was empirically observed that the nspc convolutions perform much slower than the blocked ones if the channels number more than the specific value - size_t spatialRank = ndims - 2; //two means batch dim plus channels dim + // it was empirically observed that the nspc convolutions perform much slower than the blocked ones if the + // channels number more than the specific value + size_t spatialRank = ndims - 2; // two means batch dim plus channels dim bool is1x1 = false; @@ -1190,24 +1259,24 @@ bool Convolution::isNspcAvailable() const { auto paddingRreversItr = paddingR.crbegin(); for (size_t i = 0; i < spatialRank; ++i) { - is1x1 = true - && *(weightDimsReversItr++) == 1 - && *(strideReversItr++) == 1 - && *(paddingLreversItr++) == 0 - && *(paddingRreversItr++) == 0; + is1x1 = true && *(weightDimsReversItr++) == 1 && *(strideReversItr++) == 1 && + *(paddingLreversItr++) == 0 && *(paddingRreversItr++) == 0; } } - // if the activation field size is 1x1 the avx512 1x1 nspc convolution pollutes caches so that the layer after the convolution performs slow + // if the activation field size is 1x1 the avx512 1x1 nspc convolution pollutes caches so that the layer after + // the convolution performs slow if (mayiuse(impl::cpu::x64::avx512_core) && is1x1) { auto end = inpDims.rbegin(); std::advance(end, spatialRank); - if (std::all_of(inpDims.rbegin(), end, [](size_t x) { return dimsEqualStrong(1, x); })) { + if (std::all_of(inpDims.rbegin(), end, [](size_t x) { + return dimsEqualStrong(1, x); + })) { return false; } } - unsigned 
thresholdNumChannels = 128u; // for avx and below + unsigned thresholdNumChannels = 128u; // for avx and below if (is1x1) { thresholdNumChannels = 2048u; } else if (mayiuse(impl::cpu::x64::avx512_core)) { @@ -1219,7 +1288,8 @@ bool Convolution::isNspcAvailable() const { return false; } if (!mayiuse(impl::cpu::x64::avx)) { - // SSE41 nspc convolutions do not support ic and oc tails yet and the blocked implementation will be much better than gemm + // SSE41 nspc convolutions do not support ic and oc tails yet and the blocked implementation will be much + // better than gemm if ((IC % 8) || (OC % 8)) { return false; } @@ -1246,7 +1316,7 @@ void Convolution::prepareParams() { OPENVINO_THROW("Input memory is undefined."); } - const NodeDesc *selected_pd = getSelectedPrimitiveDescriptor(); + const NodeDesc* selected_pd = getSelectedPrimitiveDescriptor(); if (selected_pd == nullptr) OPENVINO_THROW("Preferable primitive descriptor is not set for node ", getName(), "."); @@ -1319,44 +1389,41 @@ void Convolution::prepareParams() { dnnlBiasDesc = biasDescPtr->getDnnlDesc(); } - return createDescriptorInternal( - engine, - srcDesc, - wghDesc, - dnnlBiasDesc, - dstDesc, - (biasDescPtr != nullptr), - stride, - dilation, - paddingL, - paddingR, - alg, - attr); + return createDescriptorInternal(engine, + srcDesc, + wghDesc, + dnnlBiasDesc, + dstDesc, + (biasDescPtr != nullptr), + stride, + dilation, + paddingL, + paddingR, + alg, + attr); }; - dnnl::primitive_desc prim_desc = createDnnlConvDesc( - engine, - key.inp0->getDnnlDesc(), - wghDescAny, - key.out->getDnnlDesc(), - key.bias, - key.stride, - key.dilation, - key.paddingL, - key.paddingR, - convAlg, - key.attr); + dnnl::primitive_desc prim_desc = createDnnlConvDesc(engine, + key.inp0->getDnnlDesc(), + wghDescAny, + key.out->getDnnlDesc(), + key.bias, + key.stride, + key.dilation, + key.paddingL, + key.paddingR, + convAlg, + key.attr); const bool found = DnnlExtensionUtils::find_implementation(prim_desc, key.implType); if 
(found) { - return std::make_shared( - prim_desc, - key.inp0->getDnnlDesc(), - key.inp1->getDnnlDesc(), - key.out->getDnnlDesc(), - engine, - key.constWeight); + return std::make_shared(prim_desc, + key.inp0->getDnnlDesc(), + key.inp1->getDnnlDesc(), + key.out->getDnnlDesc(), + engine, + key.constWeight); } // primitive desc with proper implementation type not found, use the first available @@ -1367,40 +1434,37 @@ void Convolution::prepareParams() { key.out->getDataType(), memory::format_tag::any); - auto reorderConvDesc = createDnnlConvDesc( - engine, - inDesc, - wghDescAny, - outDesc, - key.bias, - key.stride, - key.dilation, - key.paddingL, - key.paddingR, - convAlg, - key.attr); + auto reorderConvDesc = createDnnlConvDesc(engine, + inDesc, + wghDescAny, + outDesc, + key.bias, + key.stride, + key.dilation, + key.paddingL, + key.paddingR, + convAlg, + key.attr); // unable to create a primitive desc if (!reorderConvDesc) return nullptr; if (key.attr.get()->post_ops_.count(dnnl::impl::primitive_kind::sum)) { - return std::make_shared( - reorderConvDesc, - key.inp0->getDnnlDesc(), - key.inp1->getDnnlDesc(), - key.out->getDnnlDesc(), - engine, - key.constWeight); + return std::make_shared(reorderConvDesc, + key.inp0->getDnnlDesc(), + key.inp1->getDnnlDesc(), + key.out->getDnnlDesc(), + engine, + key.constWeight); } - return std::make_shared( - reorderConvDesc, - key.inp0->getDnnlDesc(), - key.inp1->getDnnlDesc(), - key.out->getDnnlDesc(), - engine, - key.constWeight); + return std::make_shared(reorderConvDesc, + key.inp0->getDnnlDesc(), + key.inp1->getDnnlDesc(), + key.out->getDnnlDesc(), + engine, + key.constWeight); }; auto prevExecPtr = execPtr; @@ -1455,7 +1519,8 @@ Convolution::ConvolutionExecutor::ConvolutionExecutor(const dnnl::primitive_desc const dnnl::memory::desc& weightMemDesc, const dnnl::memory::desc& outMemDesc, const dnnl::engine& engine, - bool constWeight) : DnnlExecutor(pd) { + bool constWeight) + : DnnlExecutor(pd) { if (inMemDesc != 
getDnnlSrcDesc()) { inputReorders.insert({DNNL_ARG_SRC, IntermReorder(inMemDesc, getDnnlSrcDesc(), engine)}); } @@ -1475,7 +1540,8 @@ Convolution::ConvolutionSumExecutor::ConvolutionSumExecutor(const dnnl::primitiv const dnnl::memory::desc& weightMemDesc, const dnnl::memory::desc& outMemDesc, const dnnl::engine& engine, - bool constWeight) : DnnlExecutor(pd) { + bool constWeight) + : DnnlExecutor(pd) { if (inMemDesc != getDnnlSrcDesc()) { inputReorders.insert({DNNL_ARG_SRC, IntermReorder(inMemDesc, getDnnlSrcDesc(), engine)}); } @@ -1493,9 +1559,10 @@ Convolution::ConvolutionSumExecutor::ConvolutionSumExecutor(const dnnl::primitiv } } -void Convolution::ConvolutionSumExecutor::reorder_exec(std::unordered_map primArgs, dnnl::stream strm) { +void Convolution::ConvolutionSumExecutor::reorder_exec(std::unordered_map primArgs, + dnnl::stream strm) { auto outputMem = primArgs.at(DNNL_ARG_DST); - for (auto &inReorder : inputReorders) { + for (auto& inReorder : inputReorders) { if (primArgs.count(inReorder.first)) { dnnl::memory memDst(inReorder.second.getDstDesc(), strm.get_engine()); inReorder.second.exec(primArgs[inReorder.first], memDst, strm); @@ -1544,14 +1611,14 @@ void Convolution::executeDynamicImpl(dnnl::stream strm) { } void Convolution::updatePadding() { - //update padding. + // update padding. 
if (isDynamicNode() && autoPadding) { paddingL = shapeInference->get_pads_begin(); paddingR = shapeInference->get_pads_end(); } } -void Convolution::redefineOutputMemory(const std::vector &newOutputShapes) { +void Convolution::redefineOutputMemory(const std::vector& newOutputShapes) { if (withSum) { const size_t sumPortNum = getParentEdges().size() - 1; const auto& sumInpMem = getParentEdgeAt(sumPortNum)->getMemory(); @@ -1565,7 +1632,8 @@ void Convolution::redefineOutputMemory(const std::vector &newOutputS auto inp1 = subgraph->getInput(1); inp1->redefineOutputMemory({sumInpMem.getStaticDims()}); - // here we postpone output memory reallocation due to the fact that it is the same memory with the sum second input + // here we postpone output memory reallocation due to the fact that it is the same memory with the sum + // second input return; } else { withSumBroadcast = false; @@ -1574,12 +1642,10 @@ void Convolution::redefineOutputMemory(const std::vector &newOutputS Node::redefineOutputMemory(newOutputShapes); } -MemoryDescPtr Convolution::getSumMemDesc(const primitive_desc &primitive_desc_it) { +MemoryDescPtr Convolution::getSumMemDesc(const primitive_desc& primitive_desc_it) { if (getOutputShapeAtPort(0).isDynamic()) { - // When we set input shape with ranged dims, sum node input shape maybe mismatch with output shape, we just change - // ranged min value to 1 to meet this case. - // For example: - // Output shape = {1, 160, {128, 256}, {128, 256}} + // When we set input shape with ranged dims, sum node input shape maybe mismatch with output shape, we just + // change ranged min value to 1 to meet this case. 
For example: Output shape = {1, 160, {128, 256}, {128, 256}} // Sum input shape = {1, 160, 1, 1} // Update sum shape to {1, 160, {1, 256}, {1, 256}} auto shape = getOutputShapeAtPort(0); @@ -1617,7 +1683,7 @@ MemoryPtr Convolution::getOutputMemory() const { } } -void Convolution::addFusedNode(const NodePtr &fusingNode) { +void Convolution::addFusedNode(const NodePtr& fusingNode) { if (Type::Eltwise == fusingNode->getType()) { if (fusingNode->getAlgorithm() == Algorithm::EltwiseAdd) { auto eltwiseNode = std::dynamic_pointer_cast(fusingNode); @@ -1650,7 +1716,6 @@ void Convolution::appendLegacyZeroPointsArgs() { } } - void Convolution::appendZeroPointsArgs() { if (stockInputZeroPointsMemPtr != nullptr) { primArgs[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC] = stockInputZeroPointsMemPtr->getPrimitive(); @@ -1668,10 +1733,9 @@ void Convolution::initializeInputZeroPoints(const uint8_t* inputZpData, const si inputZeroPointType = zpType::PerChannel; } // Only enable per-tensor zero point on avx512-amx and avx512-core-vnni, avx2_vnni_2. - // avx2_vnni is not enabled per-tensor z because of perf regression brgconv with per-tensor zpcompared with jit per-channel zp - // If zero point is pertensor, both legacy zp and stock zp - // would be passed into conv node. The conv node would determine how to create - // post-ops attribute and prioritize to choose final onednn kernel. + // avx2_vnni is not enabled per-tensor z because of perf regression brgconv with per-tensor zpcompared with jit + // per-channel zp If zero point is pertensor, both legacy zp and stock zp would be passed into conv node. The conv + // node would determine how to create post-ops attribute and prioritize to choose final onednn kernel. 
if (inputZeroPointType == zpType::PerTensor && (impl::cpu::x64::mayiuse(impl::cpu::x64::avx512_core_amx) || impl::cpu::x64::mayiuse(impl::cpu::x64::avx512_core_vnni) || impl::cpu::x64::mayiuse(impl::cpu::x64::avx2_vnni_2))) @@ -1689,15 +1753,14 @@ VectorDims Convolution::makeInputDummyShape(const Shape& inpShape) const { const size_t filterStartIndx = weightDims.size() - spatialRank; VectorDims dummyInputShapeVals(inpShape.getRank(), dummyInputDim); - dummyInputShapeVals[1] = IC; //channels + dummyInputShapeVals[1] = IC; // channels for (size_t i = 0; i < spatialRank; i++) { if (weightDims[filterStartIndx + i] > dummyInputShapeVals[2 + i]) { constexpr Dim dummyOutputDim = 16; - dummyInputShapeVals[2 + i] = (dummyOutputDim - 1) * stride[i] - - (paddingL[i] + paddingR[i]) + - weightDims[filterStartIndx + i] + - (weightDims[filterStartIndx + i]- 1) * (dilation[i]); + dummyInputShapeVals[2 + i] = (dummyOutputDim - 1) * stride[i] - (paddingL[i] + paddingR[i]) + + weightDims[filterStartIndx + i] + + (weightDims[filterStartIndx + i] - 1) * (dilation[i]); } } return MemoryDescUtils::makeDummyShape(inpShape, dummyInputShapeVals).getStaticDims(); @@ -1707,12 +1770,12 @@ VectorDims Convolution::outputStaticShape() const { auto& outputShape = getOutputShapeAtPort(0); if (outputShape.isDynamic()) { auto inpDummyShape = makeInputDummyShape(getInputShapeAtPort(0)); - auto outputDims = shapeInferGeneric({ Shape(inpDummyShape), Shape(weightDims) }); + auto outputDims = shapeInferGeneric({Shape(inpDummyShape), Shape(weightDims)}); return Shape(outputDims.front()).getStaticDims(); } return outputShape.getStaticDims(); } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/conv.h b/src/plugins/intel_cpu/src/nodes/conv.h index a7cac9bced1241..8da3193e5760cf 100644 --- a/src/plugins/intel_cpu/src/nodes/conv.h +++ b/src/plugins/intel_cpu/src/nodes/conv.h @@ -29,7 
+29,7 @@ class Convolution : public Node { return false; } ov::element::Type getRuntimePrecision() const override; - std::shared_ptr getSrcMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const override; + std::shared_ptr getSrcMemDesc(const dnnl::primitive_desc& prim_desc, size_t idx) const override; dnnl::memory getWeights() const; dnnl::memory getBias() const; @@ -39,23 +39,35 @@ class Convolution : public Node { } bool canBeExecutedInInt8() const override; - size_t getGroupNum() const { return groupNum; } - //OV Legacy input zero point mechanism can support per-channel zero point. - //Hold legacy input zero point. + size_t getGroupNum() const { + return groupNum; + } + // OV Legacy input zero point mechanism can support per-channel zero point. + // Hold legacy input zero point. std::vector legacyInputZeroPoints; - //Hold legacy weight zero point. + // Hold legacy weight zero point. std::vector legacyWeightsZeroPoints; - //Hold legacy pre-calculated output compensation + // Hold legacy pre-calculated output compensation std::vector legacyOutputCompensation; - //Hold stock per-tensor input zero point. Pass to onednn to calculate output compensation. + // Hold stock per-tensor input zero point. Pass to onednn to calculate output compensation. 
std::vector inputZeroPoints; void initializeInputZeroPoints(const uint8_t* inputZpData, const size_t inputZpSize); - const VectorDims &getWeightDims() { return weightDims; } - const std::vector &getStride() { return stride; } - const std::vector &getDilation() { return dilation; } - const std::vector &getPaddingL() { return paddingL; } - const std::vector &getPaddingR() { return paddingR; } + const VectorDims& getWeightDims() { + return weightDims; + } + const std::vector& getStride() { + return stride; + } + const std::vector& getDilation() { + return dilation; + } + const std::vector& getPaddingL() { + return paddingL; + } + const std::vector& getPaddingR() { + return paddingR; + } bool canFuse(const NodePtr& node) const override; bool isDepthWise() const { @@ -64,16 +76,12 @@ class Convolution : public Node { protected: ov::element::Type fusedEltwisePrecision(const NodePtr& fusingNode) const; - void redefineOutputMemory(const std::vector &newOutputShapes) override; - void addFusedNode(const NodePtr &fusingNode) override; + void redefineOutputMemory(const std::vector& newOutputShapes) override; + void addFusedNode(const NodePtr& fusingNode) override; const std::vector& getDefaultImplPriority() override; private: - enum class zpType { - None, - PerTensor, - PerChannel - }; + enum class zpType { None, PerTensor, PerChannel }; class FusedSubgraph; using FusedSubgraphPtr = std::shared_ptr; @@ -81,26 +89,26 @@ class Convolution : public Node { executorPtr execPtr = nullptr; class ConvolutionExecutor : public DnnlExecutor { - public: - ConvolutionExecutor(const dnnl::primitive_desc& pd, - const dnnl::memory::desc& inMemDesc, - const dnnl::memory::desc& weightMemDesc, - const dnnl::memory::desc& outMemDesc, - const dnnl::engine& engine, - bool constWeight); + public: + ConvolutionExecutor(const dnnl::primitive_desc& pd, + const dnnl::memory::desc& inMemDesc, + const dnnl::memory::desc& weightMemDesc, + const dnnl::memory::desc& outMemDesc, + const dnnl::engine& engine, 
+ bool constWeight); }; class ConvolutionSumExecutor : public DnnlExecutor { - public: - ConvolutionSumExecutor(const dnnl::primitive_desc& pd, - const dnnl::memory::desc& inMemDesc, - const dnnl::memory::desc& weightMemDesc, - const dnnl::memory::desc& outMemDesc, - const dnnl::engine& engine, - bool constWeight); - - private: - void reorder_exec(std::unordered_map primArgs, dnnl::stream strm) override; + public: + ConvolutionSumExecutor(const dnnl::primitive_desc& pd, + const dnnl::memory::desc& inMemDesc, + const dnnl::memory::desc& weightMemDesc, + const dnnl::memory::desc& outMemDesc, + const dnnl::engine& engine, + bool constWeight); + + private: + void reorder_exec(std::unordered_map primArgs, dnnl::stream strm) override; }; void prepareParams() override; @@ -108,13 +116,16 @@ class Convolution : public Node { void executeDynamicImpl(dnnl::stream strm) override; void addLegacyZeroPoints(dnnl::primitive_attr& attr); void addZeroPoints(dnnl::primitive_attr& attr); - void setPostOps(dnnl::primitive_attr &attr, const VectorDims &dims, bool useLegacyPostOps, bool initWeights = false); - void SetPostOpsAndZeroPoints(std::vector &attrs); + void setPostOps(dnnl::primitive_attr& attr, + const VectorDims& dims, + bool useLegacyPostOps, + bool initWeights = false); + void SetPostOpsAndZeroPoints(std::vector& attrs); void filterSupportedDescriptors(); bool isNspcAvailable() const; void updatePadding(); - MemoryDescPtr getSumMemDesc(const dnnl::primitive_desc &primitive_desc_it); + MemoryDescPtr getSumMemDesc(const dnnl::primitive_desc& primitive_desc_it); MemoryPtr getOutputMemory() const; VectorDims makeInputDummyShape(const Shape& inpShape) const; VectorDims outputStaticShape() const; @@ -131,7 +142,7 @@ class Convolution : public Node { zpType inputZeroPointType = zpType::None; // maps each supportedPrimitiveDescriptor to corresponding desc from descs std::vector descIdx; - VectorDims expectedBiasDims {}; + VectorDims expectedBiasDims{}; std::vector stride; 
std::vector dilation; @@ -179,6 +190,6 @@ class Convolution : public Node { #endif }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/convert.cpp b/src/plugins/intel_cpu/src/nodes/convert.cpp index 1987c9cc83d5f2..d01a56aac1b86d 100644 --- a/src/plugins/intel_cpu/src/nodes/convert.cpp +++ b/src/plugins/intel_cpu/src/nodes/convert.cpp @@ -26,7 +26,8 @@ bool Convert::isSupportedOperation(const std::shared_ptr& op, st auto srcPrc = op->get_input_element_type(0); auto dstPrc = op->get_output_element_type(0); if (!CommonConvertExecutor::isSupported(srcPrc, dstPrc)) { - errorMessage = "cpu_convert can't convert from: " + srcPrc.to_string() + " precision to: " + dstPrc.to_string(); + errorMessage = + "cpu_convert can't convert from: " + srcPrc.to_string() + " precision to: " + dstPrc.to_string(); return false; } } catch (...) { @@ -36,7 +37,7 @@ bool Convert::isSupportedOperation(const std::shared_ptr& op, st } Convert::Convert(const std::shared_ptr& op, const GraphContext::CPtr context) - : Node(op, context, PassThroughShapeInferFactory()) { + : Node(op, context, PassThroughShapeInferFactory()) { std::string errorMessage; if (isSupportedOperation(op, errorMessage)) { errorPrefix = "Convert node with name '" + getName() + "'"; @@ -48,8 +49,11 @@ Convert::Convert(const std::shared_ptr& op, const GraphContext::CPtr c convertParams.origPrc = convert->get_destination_type(); } -Convert::Convert(const Shape &shape, const ov::element::Type &inPrc, const ov::element::Type &outPrc, - const std::string &nodeName, const GraphContext::CPtr context) +Convert::Convert(const Shape& shape, + const ov::element::Type& inPrc, + const ov::element::Type& outPrc, + const std::string& nodeName, + const GraphContext::CPtr context) : Node("Convert", {shape}, {shape}, {inPrc}, {outPrc}, nodeName, context) { convertParams.origPrc = outPrc; @@ -74,7 +78,7 @@ void 
Convert::getSupportedDescriptors() { OPENVINO_THROW(errorPrefix, " has incorrect number of output edges"); } -bool Convert::isSupportedDesc(const MemoryDesc &desc) { +bool Convert::isSupportedDesc(const MemoryDesc& desc) { bool isSupported = desc.getType() & MemoryDescType::Blocked; if (desc.getType() == MemoryDescType::DnnlBlocked) isSupported &= desc.as()->hasEmptyExtraData(); @@ -101,13 +105,16 @@ void Convert::initSupportedPrimitiveDescriptors() { MemoryDescPtr dstMemoryDesc = config.outConfs[0].getMemDesc(); convertParams.srcPrc = srcMemoryDesc->getPrecision(); convertParams.dstPrc = dstMemoryDesc->getPrecision(); - auto factory = std::make_shared(convertParams, srcMemoryDesc, dstMemoryDesc, - std::make_shared(context, getImplPriority())); + auto factory = + std::make_shared(convertParams, + srcMemoryDesc, + dstMemoryDesc, + std::make_shared(context, getImplPriority())); supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, factory); }; - // if input and output pointers are not null and not contain extra data, then the inp/output tensor descriptors were set using setDescs method, so - // they should be used as the actual descriptors. + // if input and output pointers are not null and not contain extra data, then the inp/output tensor descriptors were + // set using setDescs method, so they should be used as the actual descriptors. 
if (canInitExternalDesc) { dataIn.setMemDesc(input); config.inConfs.push_back(dataIn); @@ -142,8 +149,10 @@ void Convert::initSupportedPrimitiveDescriptors() { : BlockedDescCreator::makeFilteredRange(creators, insShape.getRank()); for (auto itr = range.first; itr != range.second; ++itr) { - config.inConfs[0].setMemDesc(std::make_shared(itr->second->createDesc(insPrecision, insShape))); - config.outConfs[0].setMemDesc(std::make_shared(itr->second->createDesc(outPrecision, outputShape))); + config.inConfs[0].setMemDesc( + std::make_shared(itr->second->createDesc(insPrecision, insShape))); + config.outConfs[0].setMemDesc( + std::make_shared(itr->second->createDesc(outPrecision, outputShape))); supportedPrimitiveDescriptorsBuilder(config); } @@ -159,10 +168,8 @@ void Convert::prepareParams() { auto selectedPD = getSelectedPrimitiveDescriptor(); MemoryDescPtr srcDesc = getSrcMemoryAtPort(0)->getDescPtr(); MemoryDescPtr dstDesc = getDstMemoryAtPort(0)->getDescPtr(); - execPtr = selectedPD->getExecutorFactoryAs()->makeExecutor(convertParams, - srcDesc, - dstDesc, - {}); + execPtr = + selectedPD->getExecutorFactoryAs()->makeExecutor(convertParams, srcDesc, dstDesc, {}); selectedPD->setImplementationType(execPtr->implType()); } @@ -189,6 +196,6 @@ bool Convert::created() const { return getType() == Type::Convert; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/convert.h b/src/plugins/intel_cpu/src/nodes/convert.h index 2a257bd1d31cd8..3bc911d118fd7a 100644 --- a/src/plugins/intel_cpu/src/nodes/convert.h +++ b/src/plugins/intel_cpu/src/nodes/convert.h @@ -14,8 +14,11 @@ namespace node { class Convert : public Node { public: Convert(const std::shared_ptr& op, const GraphContext::CPtr context); - Convert(const Shape &shape, const ov::element::Type &inPrc, const ov::element::Type &outPrc, - const std::string &nodeName, const GraphContext::CPtr 
context); + Convert(const Shape& shape, + const ov::element::Type& inPrc, + const ov::element::Type& outPrc, + const std::string& nodeName, + const GraphContext::CPtr context); void getSupportedDescriptors() override; void initSupportedPrimitiveDescriptors() override; @@ -28,22 +31,28 @@ class Convert : public Node { } // This is the interface extension designed to provide inp and output tensor descriptors without the CNNLayer. - // In that case the Convert node is instantiated with default CNNLayer and inp/out tensor descriptors are set via this method. - // This is useful if the Convert node is added to the graph as an auxiliary operation at the Graph + // In that case the Convert node is instantiated with default CNNLayer and inp/out tensor descriptors are set via + // this method. This is useful if the Convert node is added to the graph as an auxiliary operation at the Graph // initialization stage. void setDescs(const MemoryDesc& input, const MemoryDesc& output) { this->input = input.clone(); this->output = output.clone(); } - const MemoryDesc& getInput() const { return *input; } - const MemoryDesc& getOutput() const { return *output; } + const MemoryDesc& getInput() const { + return *input; + } + const MemoryDesc& getOutput() const { + return *output; + } - bool needPrepareParams() const override { return inputShapesModified(); } + bool needPrepareParams() const override { + return inputShapesModified(); + } static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; - static bool isSupportedDesc(const MemoryDesc &desc); + static bool isSupportedDesc(const MemoryDesc& desc); private: MemoryDescPtr input; @@ -55,6 +64,6 @@ class Convert : public Node { std::string errorPrefix; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder.cpp 
b/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder.cpp index 0b467fe452e061..2869d782cdb445 100644 --- a/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder.cpp +++ b/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder.cpp @@ -2,18 +2,20 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "openvino/op/ctc_greedy_decoder.hpp" + #include #include -#include "openvino/op/ctc_greedy_decoder.hpp" -#include "openvino/core/parallel.hpp" #include "ctc_greedy_decoder.h" +#include "openvino/core/parallel.hpp" namespace ov { namespace intel_cpu { namespace node { -bool CTCGreedyDecoder::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool CTCGreedyDecoder::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage) noexcept { try { const auto greedyDecOp = ov::as_type_ptr(op); if (!greedyDecOp) { @@ -61,8 +63,7 @@ void CTCGreedyDecoder::initSupportedPrimitiveDescriptors() { if (!one_of(seqLenPrecision, ov::element::f32, ov::element::bf16, ov::element::f16)) OPENVINO_THROW(errorPrefix, "has unsupported 'sequence_length' input precision: ", seqLenPrecision); - addSupportedPrimDesc({{LayoutType::ncsp, ov::element::f32}, - {LayoutType::ncsp, ov::element::f32}}, + addSupportedPrimDesc({{LayoutType::ncsp, ov::element::f32}, {LayoutType::ncsp, ov::element::f32}}, {{LayoutType::ncsp, ov::element::f32}}, impl_desc_type::ref_any); } @@ -141,7 +142,7 @@ void CTCGreedyDecoder::execute(dnnl::stream strm) { } tStart = 0lu; } - }; // thread body + }; // thread body parallel_nt(0, threadBody); @@ -151,8 +152,7 @@ void CTCGreedyDecoder::execute(dnnl::stream strm) { const size_t sequenceLength = sequenceLengths[b]; float* shiftedOut = outputSequences + b * T; for (size_t t = 0; t < sequenceLength; ++t) { - if (*shiftedOut < blankIndex && - !(mergeRepeated && *shiftedOut == prevClassIdx)) { + if (*shiftedOut < blankIndex && !(mergeRepeated && *shiftedOut == prevClassIdx)) { outputSequences[outputIndex++] = *shiftedOut; } prevClassIdx = 
*shiftedOut; @@ -174,6 +174,6 @@ bool CTCGreedyDecoder::needPrepareParams() const { return false; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder.h b/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder.h index 1f3179edb904d2..a552ff7db3c566 100644 --- a/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder.h +++ b/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder.h @@ -14,7 +14,7 @@ class CTCGreedyDecoder : public Node { public: CTCGreedyDecoder(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; @@ -22,6 +22,7 @@ class CTCGreedyDecoder : public Node { bool needPrepareParams() const override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + private: const size_t DATA_INDEX = 0lu; const size_t SEQUENCE_LENGTH_INDEX = 1lu; @@ -30,6 +31,6 @@ class CTCGreedyDecoder : public Node { std::string errorPrefix; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder_seq_len.cpp b/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder_seq_len.cpp index 63db3968094c3a..3eb02f2583e551 100644 --- a/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder_seq_len.cpp +++ b/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder_seq_len.cpp @@ -2,18 +2,20 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "ctc_greedy_decoder_seq_len.h" + +#include #include #include -#include #include "openvino/core/parallel.hpp" -#include "ctc_greedy_decoder_seq_len.h" namespace ov { namespace intel_cpu { namespace node { -bool 
CTCGreedyDecoderSeqLen::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool CTCGreedyDecoderSeqLen::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage) noexcept { try { const auto greedyDecOp = ov::as_type_ptr(op); if (!greedyDecOp) { @@ -67,33 +69,35 @@ void CTCGreedyDecoderSeqLen::initSupportedPrimitiveDescriptors() { inDataConf.emplace_back(LayoutType::ncsp, ov::element::i32); addSupportedPrimDesc(inDataConf, - {{LayoutType::ncsp, ov::element::i32}, - {LayoutType::ncsp, ov::element::i32}}, + {{LayoutType::ncsp, ov::element::i32}, {LayoutType::ncsp, ov::element::i32}}, impl_desc_type::ref_any); } void CTCGreedyDecoderSeqLen::execute(dnnl::stream strm) { const float* probabilities = getSrcDataAtPortAs(DATA_INDEX); const int* sequenceLengths = getSrcDataAtPortAs(SEQUENCE_LENGTH_INDEX); - int* decodedClasses = getDstDataAtPortAs(DECODED_CLASSES_INDEX); + int* decodedClasses = getDstDataAtPortAs(DECODED_CLASSES_INDEX); int* decodedClassesLength = getDstDataAtPortAs(DECODED_CLASSES_LENGTH_INDEX); - const size_t B = getParentEdgeAt(DATA_INDEX)->getMemory().getStaticDims()[0];; - const size_t T = getParentEdgeAt(DATA_INDEX)->getMemory().getStaticDims()[1];; - const int C = getParentEdgeAt(DATA_INDEX)->getMemory().getStaticDims()[2];; + const size_t B = getParentEdgeAt(DATA_INDEX)->getMemory().getStaticDims()[0]; + ; + const size_t T = getParentEdgeAt(DATA_INDEX)->getMemory().getStaticDims()[1]; + ; + const int C = getParentEdgeAt(DATA_INDEX)->getMemory().getStaticDims()[2]; + ; const size_t TC = T * C; int blankIndex = C - 1; if (inputShapes.size() > BLANK_INDEX) - blankIndex = (getSrcDataAtPortAs(BLANK_INDEX))[0]; + blankIndex = (getSrcDataAtPortAs(BLANK_INDEX))[0]; size_t workAmount = 0; for (size_t b = 0; b < B; b++) { if (sequenceLengths[b] > static_cast(T)) { - std::string errorMsg = errorPrefix - + ". 
Sequence length " + std::to_string(sequenceLengths[b]) - + " cannot be greater than according decoded classes dimension size " - + std::to_string(getChildEdgeAt(DECODED_CLASSES_INDEX)->getMemory().getStaticDims()[1]); + std::string errorMsg = + errorPrefix + ". Sequence length " + std::to_string(sequenceLengths[b]) + + " cannot be greater than according decoded classes dimension size " + + std::to_string(getChildEdgeAt(DECODED_CLASSES_INDEX)->getMemory().getStaticDims()[1]); OPENVINO_THROW(errorMsg); } workAmount += sequenceLengths[b]; @@ -142,7 +146,7 @@ void CTCGreedyDecoderSeqLen::execute(dnnl::stream strm) { } tStart = 0lu; } - }; // thread body + }; // thread body parallel_nt(0, threadBody); @@ -153,8 +157,7 @@ void CTCGreedyDecoderSeqLen::execute(dnnl::stream strm) { int* shiftedOut = decodedClasses + b * T; for (size_t t = 0; t < actualSeqLen; ++t) { - if (*shiftedOut != blankIndex && - !(mergeRepeated && *shiftedOut == prevClassIdx)) { + if (*shiftedOut != blankIndex && !(mergeRepeated && *shiftedOut == prevClassIdx)) { decodedClasses[outputIndex++] = *shiftedOut; } prevClassIdx = *shiftedOut; @@ -177,6 +180,6 @@ bool CTCGreedyDecoderSeqLen::needPrepareParams() const { return false; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder_seq_len.h b/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder_seq_len.h index 4e7d14fd23556a..95ab8ef84b07eb 100644 --- a/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder_seq_len.h +++ b/src/plugins/intel_cpu/src/nodes/ctc_greedy_decoder_seq_len.h @@ -14,7 +14,7 @@ class CTCGreedyDecoderSeqLen : public Node { public: CTCGreedyDecoderSeqLen(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; 
bool created() const override; @@ -34,6 +34,6 @@ class CTCGreedyDecoderSeqLen : public Node { std::string errorPrefix; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/ctc_loss.cpp b/src/plugins/intel_cpu/src/nodes/ctc_loss.cpp index 6d09b0aea7e934..0ed3d95503eb62 100644 --- a/src/plugins/intel_cpu/src/nodes/ctc_loss.cpp +++ b/src/plugins/intel_cpu/src/nodes/ctc_loss.cpp @@ -2,11 +2,12 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "openvino/op/ctc_loss.hpp" + #include -#include "openvino/op/ctc_loss.hpp" -#include "openvino/core/parallel.hpp" #include "ctc_loss.h" +#include "openvino/core/parallel.hpp" namespace ov { namespace intel_cpu { @@ -53,9 +54,7 @@ void CTCLoss::initSupportedPrimitiveDescriptors() { for (size_t i = 1; i < inputShapes.size(); ++i) inDataConf.emplace_back(LayoutType::ncsp, ov::element::i32); - addSupportedPrimDesc(inDataConf, - {{LayoutType::ncsp, ov::element::f32}}, - impl_desc_type::ref_any); + addSupportedPrimDesc(inDataConf, {{LayoutType::ncsp, ov::element::f32}}, impl_desc_type::ref_any); } void CTCLoss::executeDynamicImpl(dnnl::stream strm) { @@ -71,7 +70,7 @@ void CTCLoss::execute(dnnl::stream strm) { const int* labelsLength = getSrcDataAtPortAs(3); float* dstData = getDstDataAtPortAs(0); - const auto &inDims = getParentEdgeAt(0)->getMemory().getStaticDims(); + const auto& inDims = getParentEdgeAt(0)->getMemory().getStaticDims(); const size_t batchNum = inDims[0]; const size_t maxTime = inDims[1]; const size_t classesNum = inDims[2]; @@ -96,11 +95,11 @@ void CTCLoss::execute(dnnl::stream strm) { for (size_t b = start; b < end; b++) { if (logitsLength[b] < 0 || labelsLength[b] < 0 || logitsLength[b] > static_cast(maxTime) || labelsLength[b] > logitsLength[b]) { - errorMsgB[ithr] = errorPrefix + ". Logit length cannot be greater than max sequence length. 
" - + "Label length cannot be greater than a logit length" - + " and both cannot be negative.\nMaxSeqLen: " - + std::to_string(maxTime) + "; Logit len: " + std::to_string(logitsLength[b]) - + "; Label len: " + std::to_string(labelsLength[b]); + errorMsgB[ithr] = errorPrefix + ". Logit length cannot be greater than max sequence length. " + + "Label length cannot be greater than a logit length" + + " and both cannot be negative.\nMaxSeqLen: " + std::to_string(maxTime) + + "; Logit len: " + std::to_string(logitsLength[b]) + + "; Label len: " + std::to_string(labelsLength[b]); returnCode = -1; return; } @@ -151,8 +150,8 @@ void CTCLoss::execute(dnnl::stream strm) { for (size_t ll = 0; ll < actualLogitLen; ll++) { logProbabilities[ll].resize(decodedTargetLen); } - } // for batch - }; // threadBody_1 + } // for batch + }; // threadBody_1 parallel_nt(threads_num, threadBody_1); if (returnCode != 0) { @@ -211,7 +210,7 @@ void CTCLoss::execute(dnnl::stream strm) { } sT = 0lu; } // for batch - }; // threadBody_2 + }; // threadBody_2 parallel_nt(0, threadBody_2); @@ -236,8 +235,8 @@ void CTCLoss::execute(dnnl::stream strm) { if (start >= end) return; - // As per Connectionist Temporal Classification - Labeling Unsegmented Sequence Data with Recurrent Neural Networks: - // Graves et al., 2016, paragraph 4.1 (10) + // As per Connectionist Temporal Classification - Labeling Unsegmented Sequence Data with Recurrent Neural + // Networks: Graves et al., 2016, paragraph 4.1 (10) for (size_t b = start; b < end; b++) { auto& targetD = targetDB[b]; auto& logProbabilities = logProbabilitiesB[b]; @@ -250,21 +249,19 @@ void CTCLoss::execute(dnnl::stream strm) { for (int t = actualLogitLen - 2; t >= 0; t--) { const int t_1 = t + 1; for (int s = std::max(0, decodedTargetLen - (2 * (actualLogitLen - t))); - s < std::min(decodedTargetLen, 2 * (t_1)); s++) { + s < std::min(decodedTargetLen, 2 * (t_1)); + s++) { if (ctcMergeRepeated || targetD[s] == blankIndex) { - logBwd[s][t] = 
sumLogs(logBwd[s][t], - logBwd[s][t_1] + logProbabilities[t_1][s]); + logBwd[s][t] = sumLogs(logBwd[s][t], logBwd[s][t_1] + logProbabilities[t_1][s]); } if (s + 1 < decodedTargetLen) { - logBwd[s][t] = sumLogs(logBwd[s][t], - logBwd[s + 1][t_1] + logProbabilities[t_1][s + 1]); + logBwd[s][t] = sumLogs(logBwd[s][t], logBwd[s + 1][t_1] + logProbabilities[t_1][s + 1]); } if (s + 2 < decodedTargetLen) { if (targetD[s] != blankIndex && (!ctcMergeRepeated || (targetD[s] != targetD[s + 2]))) { - logBwd[s][t] = sumLogs(logBwd[s][t], - logBwd[s + 2][t_1] + logProbabilities[t_1][s + 2]); + logBwd[s][t] = sumLogs(logBwd[s][t], logBwd[s + 2][t_1] + logProbabilities[t_1][s + 2]); } } } @@ -274,8 +271,8 @@ void CTCLoss::execute(dnnl::stream strm) { logBwd[1][0] += logProbabilities[0][(decodedTargetLen > 1) ? 1 : 0]; dstData[b] = -sumLogs(logBwd[0][0], logBwd[1][0]); - } // for batch - }; // threadBody_3 + } // for batch + }; // threadBody_3 parallel_nt(0, threadBody_3); } @@ -284,6 +281,6 @@ bool CTCLoss::created() const { return getType() == Type::CTCLoss; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/ctc_loss.h b/src/plugins/intel_cpu/src/nodes/ctc_loss.h index a07d8f0fc59479..d1a66df3b92b89 100644 --- a/src/plugins/intel_cpu/src/nodes/ctc_loss.h +++ b/src/plugins/intel_cpu/src/nodes/ctc_loss.h @@ -14,7 +14,7 @@ class CTCLoss : public Node { public: CTCLoss(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; @@ -22,7 +22,9 @@ class CTCLoss : public Node { static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; void executeDynamicImpl(dnnl::stream strm) override; - bool 
needPrepareParams() const override { return false; }; + bool needPrepareParams() const override { + return false; + }; private: bool ctcMergeRepeated; @@ -32,6 +34,6 @@ class CTCLoss : public Node { std::string errorPrefix; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/cum_sum.cpp b/src/plugins/intel_cpu/src/nodes/cum_sum.cpp index e411283e661585..43e69e29916430 100644 --- a/src/plugins/intel_cpu/src/nodes/cum_sum.cpp +++ b/src/plugins/intel_cpu/src/nodes/cum_sum.cpp @@ -3,15 +3,16 @@ // #include "cum_sum.h" + +#include +#include + #include "openvino/core/parallel.hpp" #include "openvino/core/type/float16.hpp" #include "openvino/opsets/opset1.hpp" #include "openvino/opsets/opset3.hpp" #include "utils/bfloat16.hpp" -#include -#include - namespace ov { namespace intel_cpu { namespace node { @@ -38,10 +39,11 @@ CumSum::CumSum(const std::shared_ptr& op, const GraphContext::CPtr con errorPrefix = "CumSum layer with name '" + op->get_friendly_name() + "' "; - if ((getOriginalInputsNumber() != numOfInputs && getOriginalInputsNumber() != (numOfInputs - 1)) || getOriginalOutputsNumber() != 1) + if ((getOriginalInputsNumber() != numOfInputs && getOriginalInputsNumber() != (numOfInputs - 1)) || + getOriginalOutputsNumber() != 1) OPENVINO_THROW(errorPrefix, " has incorrect number of input/output edges!"); - const auto &dataShape = getInputShapeAtPort(CUM_SUM_DATA); + const auto& dataShape = getInputShapeAtPort(CUM_SUM_DATA); numOfDims = dataShape.getRank(); if (numOfDims < 1) { OPENVINO_THROW(errorPrefix, " doesn't support 'data' input tensor with rank: ", numOfDims); @@ -70,13 +72,19 @@ void CumSum::initSupportedPrimitiveDescriptors() { dataPrecision = getOriginalInputPrecisionAtPort(CUM_SUM_DATA); if (!one_of(dataPrecision, - ov::element::i8, ov::element::u8, - ov::element::i16, ov::element::i32, ov::element::i64, ov::element::u64, - 
ov::element::bf16, ov::element::f16, ov::element::f32)) + ov::element::i8, + ov::element::u8, + ov::element::i16, + ov::element::i32, + ov::element::i64, + ov::element::u64, + ov::element::bf16, + ov::element::f16, + ov::element::f32)) OPENVINO_THROW(errorPrefix, " has unsupported 'data' input precision: ", dataPrecision.get_type_name()); if (inputShapes.size() == numOfInputs) { - const auto &axisTensorPrec = getOriginalInputPrecisionAtPort(AXIS); + const auto& axisTensorPrec = getOriginalInputPrecisionAtPort(AXIS); if (axisTensorPrec != ov::element::i32 && axisTensorPrec != ov::element::i64) OPENVINO_THROW(errorPrefix, " has unsupported 'axis' input precision: ", axisTensorPrec.get_type_name()); } @@ -87,16 +95,17 @@ void CumSum::initSupportedPrimitiveDescriptors() { for (size_t i = 1; i < inputShapes.size(); ++i) inDataConf.emplace_back(LayoutType::ncsp, ov::element::i32); - addSupportedPrimDesc(inDataConf, - {{LayoutType::ncsp, dataPrecision}}, - impl_desc_type::ref_any); + addSupportedPrimDesc(inDataConf, {{LayoutType::ncsp, dataPrecision}}, impl_desc_type::ref_any); } void CumSum::execute(dnnl::stream strm) { if (inputShapes.size() == numOfInputs) axis = getAxis(getParentEdgeAt(AXIS)->getMemory(), getParentEdgeAt(CUM_SUM_DATA)->getMemory()); - OV_SWITCH(intel_cpu, CumSumExecute, this, dataPrecision, + OV_SWITCH(intel_cpu, + CumSumExecute, + this, + dataPrecision, OV_CASE(ov::element::i8, int8_t), OV_CASE(ov::element::u8, uint8_t), OV_CASE(ov::element::i16, int16_t), @@ -110,9 +119,10 @@ void CumSum::execute(dnnl::stream strm) { template void CumSum::exec() { - const auto *input = getSrcDataAtPortAs(CUM_SUM_DATA); - auto *output = getDstDataAtPortAs(0); - const VectorDims strides = getParentEdgeAt(CUM_SUM_DATA)->getMemory().getDescWithType()->getStrides(); + const auto* input = getSrcDataAtPortAs(CUM_SUM_DATA); + auto* output = getDstDataAtPortAs(0); + const VectorDims strides = + getParentEdgeAt(CUM_SUM_DATA)->getMemory().getDescWithType()->getStrides(); if 
(reverse) { if (exclusive) { @@ -130,16 +140,17 @@ void CumSum::exec() { } template -void CumSum::cumSum(const dataType *input, dataType *output, const VectorDims &strides) { +void CumSum::cumSum(const dataType* input, dataType* output, const VectorDims& strides) { VectorDims iterationRange(numOfDims - 1); size_t j = 0; - const auto &shape = getParentEdgeAt(CUM_SUM_DATA)->getMemory().getStaticDims(); + const auto& shape = getParentEdgeAt(CUM_SUM_DATA)->getMemory().getStaticDims(); for (size_t i = 0; i < shape.size(); i++) { if (i == axis) continue; iterationRange[j++] = shape[i]; } - size_t work_amount_dst = std::accumulate(iterationRange.begin(), iterationRange.end(), size_t(1), std::multiplies()); + size_t work_amount_dst = + std::accumulate(iterationRange.begin(), iterationRange.end(), size_t(1), std::multiplies()); parallel_nt(0, [&](const int ithr, const int nthr) { size_t start = 0, end = 0; VectorDims counters(numOfDims - 1, 0); @@ -159,32 +170,32 @@ void CumSum::cumSum(const dataType *input, dataType *output, const VectorDims &s size_t startOffset = getStartOffset(forStartOffset, strides); - const dataType *inputStart = input + startOffset; - dataType *outputStart = output + startOffset; + const dataType* inputStart = input + startOffset; + dataType* outputStart = output + startOffset; size_t offset = strides[axis]; if (reverse) { if (exclusive) { - outputStart[offset*(shape[axis] - 1)] = 0; + outputStart[offset * (shape[axis] - 1)] = 0; for (int64_t i = shape[axis] - 2; i >= 0; i--) { - outputStart[i*offset] = inputStart[(i+1)*offset] + outputStart[(i+1)*offset]; + outputStart[i * offset] = inputStart[(i + 1) * offset] + outputStart[(i + 1) * offset]; } } else { - outputStart[offset*(shape[axis] - 1)] = inputStart[offset * (shape[axis] - 1)]; + outputStart[offset * (shape[axis] - 1)] = inputStart[offset * (shape[axis] - 1)]; for (int64_t i = shape[axis] - 2; i >= 0; i--) { - outputStart[i*offset] = inputStart[i*offset] + outputStart[(i+1)*offset]; + 
outputStart[i * offset] = inputStart[i * offset] + outputStart[(i + 1) * offset]; } } } else { if (exclusive) { outputStart[0] = 0; for (size_t i = 1; i < shape[axis]; i++) { - outputStart[i*offset] = inputStart[(i-1)*offset] + outputStart[(i-1)*offset]; + outputStart[i * offset] = inputStart[(i - 1) * offset] + outputStart[(i - 1) * offset]; } } else { outputStart[0] = inputStart[0]; for (size_t i = 1; i < shape[axis]; i++) { - outputStart[i*offset] = inputStart[i*offset] + outputStart[(i-1)*offset]; + outputStart[i * offset] = inputStart[i * offset] + outputStart[(i - 1) * offset]; } } } @@ -219,7 +230,8 @@ inline void CumSum::parallelItStep(std::vector& counters, const std::vec } } -inline size_t CumSum::getStartOffset(const std::vector &forStartOffset, const std::vector& strides) const { +inline size_t CumSum::getStartOffset(const std::vector& forStartOffset, + const std::vector& strides) const { size_t startOffset = 0; for (size_t idx = 0; idx < forStartOffset.size(); ++idx) { startOffset += forStartOffset[idx] * strides[idx]; @@ -232,19 +244,19 @@ size_t CumSum::getAxis(const IMemory& _axis, const IMemory& _data) const { const int64_t dataShapeSize = static_cast(_data.getShape().getRank()); int64_t axisValueFromBlob = 0; switch (axisPrecision) { - case ov::element::i32 : { - const auto *axisPtr = _axis.getDataAs(); - axisValueFromBlob = static_cast(axisPtr[0]); - break; - } - case ov::element::i64 : { - const auto *axisPtr = _axis.getDataAs(); - axisValueFromBlob = axisPtr[0]; - break; - } - default : { - OPENVINO_THROW(errorPrefix, " doesn't support 'axis' input with precision: ", axisPrecision.get_type_name()); - } + case ov::element::i32: { + const auto* axisPtr = _axis.getDataAs(); + axisValueFromBlob = static_cast(axisPtr[0]); + break; + } + case ov::element::i64: { + const auto* axisPtr = _axis.getDataAs(); + axisValueFromBlob = axisPtr[0]; + break; + } + default: { + OPENVINO_THROW(errorPrefix, " doesn't support 'axis' input with precision: ", 
axisPrecision.get_type_name()); + } } if (axisValueFromBlob < -dataShapeSize || axisValueFromBlob > dataShapeSize - 1) OPENVINO_THROW(errorPrefix, " has axis with a value out of range: ", axisValueFromBlob); @@ -263,6 +275,6 @@ void CumSum::executeDynamicImpl(dnnl::stream strm) { execute(strm); } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/cum_sum.h b/src/plugins/intel_cpu/src/nodes/cum_sum.h index b0aad351d55f93..139c7205e81fcc 100644 --- a/src/plugins/intel_cpu/src/nodes/cum_sum.h +++ b/src/plugins/intel_cpu/src/nodes/cum_sum.h @@ -14,7 +14,7 @@ class CumSum : public Node { public: CumSum(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; @@ -29,13 +29,13 @@ class CumSum : public Node { void exec(); template - void cumSum(const dataType *input, dataType *output, const std::vector &strides); + void cumSum(const dataType* input, dataType* output, const std::vector& strides); void parallelItInit(size_t start, std::vector& counters, const std::vector& iterationRange); inline void parallelItStep(std::vector& counters, const std::vector& iterationRange); - inline size_t getStartOffset(const std::vector &forStartOffset, const std::vector& strides) const; + inline size_t getStartOffset(const std::vector& forStartOffset, const std::vector& strides) const; size_t getAxis(const IMemory& _axis, const IMemory& _data) const; @@ -48,7 +48,7 @@ class CumSum : public Node { ov::element::Type dataPrecision; std::string errorPrefix; - template + template struct CumSumExecute { void operator()(CumSum* node) { node->exec(); @@ -56,6 +56,6 @@ class CumSum : public Node { }; }; -} // namespace node -} // namespace intel_cpu -} // 
namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/deconv.cpp b/src/plugins/intel_cpu/src/nodes/deconv.cpp index 2ee858e730c900..f30e3481afbb3d 100644 --- a/src/plugins/intel_cpu/src/nodes/deconv.cpp +++ b/src/plugins/intel_cpu/src/nodes/deconv.cpp @@ -4,16 +4,15 @@ #include "deconv.h" -#include "dnnl_extension_utils.h" #include #include -#include "common/primitive_hashing_utils.hpp" #include #include -#include "cpu/x64/cpu_isa_traits.hpp" -#include "shape_inference/shape_inference_ngraph.hpp" +#include "common/primitive_hashing_utils.hpp" +#include "cpu/x64/cpu_isa_traits.hpp" +#include "dnnl_extension_utils.h" #include "eltwise.h" #include "fake_quantize.h" #include "input.h" @@ -21,16 +20,16 @@ #include "openvino/core/parallel.hpp" #include "openvino/opsets/opset1.hpp" #include "openvino/runtime/make_tensor.hpp" -#include "utils/general_utils.h" +#include "shape_inference/shape_inference.hpp" #include "utils/cpu_utils.hpp" +#include "utils/general_utils.h" #if defined(OV_CPU_WITH_ACL) -#include "executors/acl/acl_utils.hpp" -#include "utils/debug_capabilities.h" +# include "executors/acl/acl_utils.hpp" +# include "utils/debug_capabilities.h" #endif #include - #include #include @@ -40,8 +39,8 @@ namespace ov { namespace intel_cpu { namespace node { -using DefaultDeconvDescs = std::pair; +using DefaultDeconvDescs = + std::pair; using Int8DeconvDesc = dnnl::deconvolution_forward::primitive_desc; namespace { @@ -92,7 +91,7 @@ size_t DeconvKey::hash() const { return seed; } -bool DeconvKey::operator==(const DeconvKey &rhs) const { +bool DeconvKey::operator==(const DeconvKey& rhs) const { bool retVal = true; if (inp0 != rhs.inp0) { retVal = retVal && inp0 && rhs.inp0 && inp0->getDnnlDesc() == rhs.inp0->getDnnlDesc(); @@ -122,29 +121,31 @@ bool DeconvKey::operator==(const DeconvKey &rhs) const { } /** - * Deconvolution shape inference factory. 
It defines the input mask depending on the existence of the `output_shape` input. - * Since in case it exists, plugin should pass the input data to the shape inference function. + * Deconvolution shape inference factory. It defines the input mask depending on the existence of the `output_shape` + * input. Since in case it exists, plugin should pass the input data to the shape inference function. * */ class DeconfolutionShapeInferFactory : public ShapeInferFactory { public: - DeconfolutionShapeInferFactory(std::shared_ptr op) : m_op(op) {} + DeconfolutionShapeInferFactory(std::shared_ptr op) : m_op(std::move(op)) {} + ShapeInferPtr makeShapeInfer() const override { - if (m_op->get_input_size() > 2) { - return std::make_shared(make_shape_inference(m_op), PortMask(2)); - } - return std::make_shared(make_shape_inference(m_op), EMPTY_PORT_MASK); + const auto port_mask = (m_op->get_input_size() > 2) ? PortMask(2) : EMPTY_PORT_MASK; + return make_shape_inference(m_op, port_mask); } + private: std::shared_ptr m_op; }; -} // namespace +} // namespace -bool Deconvolution::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool Deconvolution::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage) noexcept { try { if (std::dynamic_pointer_cast(op) == nullptr && - std::dynamic_pointer_cast(op) == nullptr) { - errorMessage = "Only opset1 ConvolutionBackpropData and GroupConvolutionBackpropData operations are supported"; + std::dynamic_pointer_cast(op) == nullptr) { + errorMessage = + "Only opset1 ConvolutionBackpropData and GroupConvolutionBackpropData operations are supported"; return false; } size_t ndims = op->get_input_partial_shape(0).rank().get_length(); @@ -152,7 +153,8 @@ bool Deconvolution::isSupportedOperation(const std::shared_ptr& errorMessage = "Only 3D, 4D and 5D blobs are supported as input"; return false; } - if (op->get_input_partial_shape(1).is_dynamic() || (op->get_input_size() > 2 && 
op->get_input_partial_shape(2).is_dynamic())) { + if (op->get_input_partial_shape(1).is_dynamic() || + (op->get_input_size() > 2 && op->get_input_partial_shape(2).is_dynamic())) { errorMessage = "Doesn't support dynamic shapes for 'weights' and 'output_shape' inputs"; return false; } @@ -162,8 +164,8 @@ bool Deconvolution::isSupportedOperation(const std::shared_ptr& return true; } -Deconvolution::Deconvolution(const std::shared_ptr& op, - const GraphContext::CPtr context) : Node(op, context, DeconfolutionShapeInferFactory(op)) { +Deconvolution::Deconvolution(const std::shared_ptr& op, const GraphContext::CPtr context) + : Node(op, context, DeconfolutionShapeInferFactory(op)) { std::string errorMessage; errorPrefix = "Deconvolution node with name '" + getName() + "' "; if (!isSupportedOperation(op, errorMessage)) @@ -176,7 +178,7 @@ Deconvolution::Deconvolution(const std::shared_ptr& op, IC = weightDims[0]; OC = weightDims[1]; - expectedBiasDims = {OC}; + expectedBiasDims = {OC}; groupNum = 1; withGroups = false; @@ -199,7 +201,7 @@ Deconvolution::Deconvolution(const std::shared_ptr& op, groupNum = weightDims[0]; IC = groupNum * weightDims[1]; OC = groupNum * weightDims[2]; - expectedBiasDims = {OC}; + expectedBiasDims = {OC}; withGroups = groupNum > 1; isDW = withGroups && groupNum == OC && groupNum == IC; @@ -229,8 +231,11 @@ Deconvolution::Deconvolution(const std::shared_ptr& op, lastOutputSpatialDims = ov::as_type(op->get_input_node_ptr(2))->cast_vector(); if (externOutShape && isDynamicNode()) { const auto spDimsNum = getInputShapeAtPort(0).getRank() - 2; - if (getInputShapeAtPort(2).getStaticDims()[0] != spDimsNum || (isConstOutShape && lastOutputSpatialDims.size() != spDimsNum)) { - OPENVINO_THROW(errorPrefix, "'output_shape' input has incorrect number of elements. 
Expected = ", spDimsNum); + if (getInputShapeAtPort(2).getStaticDims()[0] != spDimsNum || + (isConstOutShape && lastOutputSpatialDims.size() != spDimsNum)) { + OPENVINO_THROW(errorPrefix, + "'output_shape' input has incorrect number of elements. Expected = ", + spDimsNum); } } @@ -240,8 +245,10 @@ Deconvolution::Deconvolution(const std::shared_ptr& op, for (size_t i = 0; i < spatialRank; ++i) is1x1 = is1x1 && *(weightDimsReversItr++) == 1; // 1x1 deconv has some test case failed. The cause is upstream ONEDNN unsupported brgemm implementation cases are - // enabled in forked ONEDNNN https://github.com/openvinotoolkit/oneDNN/blob/117e287000b48a34a7218fcaa274a91571141728/src/common/convolution.cpp#L138. - // Some test cases on 1x1 kernel failed on accuracy check, current WA is disabling brgemm deconv implementation for such cases. + // enabled in forked ONEDNNN + // https://github.com/openvinotoolkit/oneDNN/blob/117e287000b48a34a7218fcaa274a91571141728/src/common/convolution.cpp#L138. + // Some test cases on 1x1 kernel failed on accuracy check, current WA is disabling brgemm deconv implementation for + // such cases. if (is1x1 && deconvAttrs.paddingL != deconvAttrs.paddingR) { // case1: Specify asymmetric padding explicitly asymmetricPaddingAnd1x1 = true; @@ -290,7 +297,9 @@ bool Deconvolution::canBeExecutedInInt8() const { return false; if (!impl::cpu::x64::mayiuse(impl::cpu::x64::avx512_core)) { const auto& inMaxDims = getOutputShapeAtPort(0).getMaxDims(); - if (std::any_of(inMaxDims.begin(), inMaxDims.end(), [](Dim dim) { return dim == Shape::UNDEFINED_DIM; })) { + if (std::any_of(inMaxDims.begin(), inMaxDims.end(), [](Dim dim) { + return dim == Shape::UNDEFINED_DIM; + })) { return false; } // heuristicConst = 2^26 @@ -310,7 +319,8 @@ bool Deconvolution::canBeExecutedInInt8() const { // not supported in oneDNN int channelBlock = impl::cpu::x64::mayiuse(impl::cpu::x64::avx512_core) ? 16 - : impl::cpu::x64::mayiuse(impl::cpu::x64::avx2) ? 
8 : 4; + : impl::cpu::x64::mayiuse(impl::cpu::x64::avx2) ? 8 + : 4; if (withGroups && !isDW && (IC % channelBlock != 0 || OC % channelBlock != 0)) return false; if (!impl::cpu::x64::mayiuse(impl::cpu::x64::avx512_core) && deconvAttrs.stride.back() > 3) @@ -331,16 +341,18 @@ bool Deconvolution::canBeExecutedInInt8() const { bool Deconvolution::canFuse(const NodePtr& node) const { if (canBeExecutedInInt8()) return canFuseSimpleOperation(node); - // Upstream ONEDNN conv_backward_data primitive can't support any post-ops, fork onednn added depthwise support in conv_backward_data JIT implementation. - // ONEDNN deconv primitive can support most of post-ops, but the post-ops implementation details are different. - // So current deconv implementation list in onednn has 2 kinds of implements: + // Upstream ONEDNN conv_backward_data primitive can't support any post-ops, fork onednn added depthwise support in + // conv_backward_data JIT implementation. ONEDNN deconv primitive can support most of post-ops, but the post-ops + // implementation details are different. So current deconv implementation list in onednn has 2 kinds of implements: // 1. deconv implementation with JIT post-ops supported in the kernel (such as brgdeconv) - // 2. forked conv_data_backwards implementation with JIT depthwise post-ops + reference implementation for other post ops. - // Considering that some deconv fallback on the JIT implementation, we limit the post ops fusing to avoid regressions. - // Regression with stylegan2 int8 model pattern: - // none-quantzied deconv(with none-const weight) + FQ pattern fall back on JIT because of onednn limitation. (fall back ticket MFDNN-11577). - // If FQ is fused, it runs with the ref post-ops implementation. - // @todo: if onednn can ensure all the deconv run with the brgemm implementation, we can unify the fuse criteria between int8 and fp32 use cases. + // 2. 
forked conv_data_backwards implementation with JIT depthwise post-ops + reference implementation for other + // post ops. + // Considering that some deconv fallback on the JIT implementation, we limit the post ops fusing to avoid + // regressions. Regression with stylegan2 int8 model pattern: none-quantzied deconv(with none-const weight) + FQ + // pattern fall back on JIT because of onednn limitation. (fall back ticket MFDNN-11577). If FQ is fused, it runs + // with the ref post-ops implementation. + // @todo: if onednn can ensure all the deconv run with the brgemm implementation, we can unify the fuse criteria + // between int8 and fp32 use cases. return (fusedWith.empty() && node->canBePerformedAsScaleShift(this)); } @@ -361,8 +373,10 @@ std::pair Deconvolution::makeDummyInOutShape() { const auto& maxDims = shape.getMaxDims(); const auto& dims = shape.getDims(); for (size_t i = 0; i < dims.size() - 2; ++i) { - lastOutputSpatialDims[i] = dims[i + 2] == Shape::UNDEFINED_DIM ? std::min(maxDims[i + 2], - std::max(minDims[i + 2], static_cast(64))) : dims[i + 2]; + lastOutputSpatialDims[i] = + dims[i + 2] == Shape::UNDEFINED_DIM + ? std::min(maxDims[i + 2], std::max(minDims[i + 2], static_cast(64))) + : dims[i + 2]; } } @@ -381,14 +395,18 @@ std::pair Deconvolution::makeDummyInOutShape() { for (size_t i = 0; i < origInDims.size() - 2; i++) { if (origInDims[i + 2] == Shape::UNDEFINED_DIM && (origInMinDims[i + 2] != 0 || origInMaxDims[i + 2] != Shape::UNDEFINED_DIM)) { - // if input shape is dynamic and bounded, paddings should be computed basing on the following limitations: + // if input shape is dynamic and bounded, paddings should be computed basing on the following + // limitations: // 1. paddings must not be negative - // 2. 
the result padding must have such a value to keep the dummy dimensions inside the predefined interval - auto c1 = lastOutputSpatialDims[i] - deconvAttrs.outputPadding[i] - 1 - - (deconvAttrs.dilation[i] + 1) * static_cast(weightDims[wghOffset + 2 + i] - 1); + // 2. the result padding must have such a value to keep the dummy dimensions inside the + // predefined interval + auto c1 = + lastOutputSpatialDims[i] - deconvAttrs.outputPadding[i] - 1 - + (deconvAttrs.dilation[i] + 1) * static_cast(weightDims[wghOffset + 2 + i] - 1); if (origInMaxDims[i + 2] != Shape::UNDEFINED_DIM) { - auto upper_bound = deconvAttrs.stride[i] * static_cast(origInMaxDims[i + 2] - 1) - c1; + auto upper_bound = + deconvAttrs.stride[i] * static_cast(origInMaxDims[i + 2] - 1) - c1; if (upper_bound < 0) { OPENVINO_THROW(errorPrefix, ": paddings for dummy shapes can't be computed"); } @@ -404,9 +422,11 @@ std::pair Deconvolution::makeDummyInOutShape() { for (size_t i = 0; i < inputDims.size() - 2; i++) { if (origInDims[2 + i] == Shape::UNDEFINED_DIM) { - inputDims[2 + i] = (lastOutputSpatialDims[i] - (deconvAttrs.dilation[i] + 1) * - (weightDims[wghOffset + 2 + i] - 1) - 1 + paddings[i] - deconvAttrs.outputPadding[i]) / - deconvAttrs.stride[i] + 1; + inputDims[2 + i] = (lastOutputSpatialDims[i] - + (deconvAttrs.dilation[i] + 1) * (weightDims[wghOffset + 2 + i] - 1) - 1 + + paddings[i] - deconvAttrs.outputPadding[i]) / + deconvAttrs.stride[i] + + 1; } } } @@ -457,12 +477,14 @@ void Deconvolution::getSupportedDescriptors() { if (!descs.empty()) return; isInt8 = canBeExecutedInInt8(); - deconvAttrs.withBiasesParam = withBiases = externOutShape ? getOriginalInputsNumber() == 4 : getOriginalInputsNumber() == 3; + deconvAttrs.withBiasesParam = withBiases = + externOutShape ? 
getOriginalInputsNumber() == 4 : getOriginalInputsNumber() == 3; ov::element::Type inPrecision = getOriginalInputPrecisionAtPort(0); ov::element::Type outPrecision = getOriginalOutputPrecisionAtPort(0); if (isInt8) { - // TODO: We have to extend jit_avx512_core_x8s8s32x_deconv_fwd_kernel from oneDNN to support BF16 output data type + // TODO: We have to extend jit_avx512_core_x8s8s32x_deconv_fwd_kernel from oneDNN to support BF16 output data + // type if (ov::element::bf16 == inPrecision) inPrecision = ov::element::f32; if (ov::element::bf16 == outPrecision) @@ -476,11 +498,12 @@ void Deconvolution::getSupportedDescriptors() { auto inputDataType = DnnlExtensionUtils::ElementTypeToDataType(inPrecision); outputDataType = DnnlExtensionUtils::ElementTypeToDataType(outPrecision); if (inputDataType == memory::data_type::bf16 || outputDataType == memory::data_type::bf16) - inputDataType = outputDataType = memory::data_type::bf16; + inputDataType = outputDataType = memory::data_type::bf16; if (inputDataType == memory::data_type::f16 || outputDataType == memory::data_type::f16) - inputDataType = outputDataType = memory::data_type::f16; + inputDataType = outputDataType = memory::data_type::f16; if (!fusedWith.empty()) { - outputDataType = DnnlExtensionUtils::ElementTypeToDataType(fusedWith[fusedWith.size() - 1]->getOriginalOutputPrecisionAtPort(0)); + outputDataType = DnnlExtensionUtils::ElementTypeToDataType( + fusedWith[fusedWith.size() - 1]->getOriginalOutputPrecisionAtPort(0)); } if (getParentEdges().size() != (withBiases ? 
(biasPort + 1) : biasPort)) { OPENVINO_THROW(errorPrefix, " has incorrect number of input edges"); @@ -490,7 +513,7 @@ void Deconvolution::getSupportedDescriptors() { } VectorDims inDims, outDims; std::tie(inDims, outDims) = makeDummyInOutShape(); - inShape = Shape(inDims); + inShape = Shape(inDims); outShape = Shape(outDims); initPaddingR(inShape, outShape); @@ -506,17 +529,18 @@ void Deconvolution::getSupportedDescriptors() { config.outConfs.resize(getOriginalOutputsNumber()); // ACL use same precision for all inputs config.inConfs[0].setMemDesc( - creatorsMap.at(format)->createSharedDesc(getOriginalInputPrecisionAtPort(0), getInputShapeAtPort(0))); + creatorsMap.at(format)->createSharedDesc(getOriginalInputPrecisionAtPort(0), getInputShapeAtPort(0))); config.inConfs[1].setMemDesc( - creatorsMap.at(weights_format)->createSharedDesc(getOriginalInputPrecisionAtPort(0), getInputShapeAtPort(1))); + creatorsMap.at(weights_format) + ->createSharedDesc(getOriginalInputPrecisionAtPort(0), getInputShapeAtPort(1))); for (size_t i = 2; i < getParentEdges().size(); ++i) { config.inConfs[i].setMemDesc( - creatorsMap.at(format)->createSharedDesc(getOriginalInputPrecisionAtPort(0), getInputShapeAtPort(i))); + creatorsMap.at(format)->createSharedDesc(getOriginalInputPrecisionAtPort(0), getInputShapeAtPort(i))); } for (size_t i = 0; i < config.outConfs.size(); ++i) { config.outConfs[i].setMemDesc( - creatorsMap.at(format)->createSharedDesc(getOriginalOutputPrecisionAtPort(0), getOutputShapeAtPort(i))); + creatorsMap.at(format)->createSharedDesc(getOriginalOutputPrecisionAtPort(0), getOutputShapeAtPort(i))); } std::vector srcMemoryDescs; @@ -533,7 +557,8 @@ void Deconvolution::getSupportedDescriptors() { return AclDeconvExecutorBuilder::customIsSupported(deconvAttrs, srcMemoryDescs, dstMemoryDescs); }; useACL = checkDesc(LayoutType::nspc) || checkDesc(LayoutType::ncsp); - if (useACL) return; + if (useACL) + return; #endif dnnlCompatibleWeiDims = getWeightDims(); // Construct the 
ONEDNN deconv OP weight shape. @@ -548,26 +573,30 @@ void Deconvolution::getSupportedDescriptors() { auto format = rank == 5 ? dnnl::memory::format_tag::ndhwc : rank == 4 ? dnnl::memory::format_tag::nhwc : dnnl::memory::format_tag::nwc; - MemoryDescPtr in_candidate = std::make_shared(getInputShapeAtPort(0), inputDataType, format); - MemoryDescPtr out_candidate = std::make_shared(getOutputShapeAtPort(0), outputDataType, format); + MemoryDescPtr in_candidate = + std::make_shared(getInputShapeAtPort(0), inputDataType, format); + MemoryDescPtr out_candidate = + std::make_shared(getOutputShapeAtPort(0), outputDataType, format); createDescriptor({in_candidate}, {out_candidate}); } else { for (auto format : getAvailableFormatsForDims(getInputShapeAtPort(0))) { - MemoryDescPtr in_candidate = std::make_shared(getInputShapeAtPort(0), inputDataType, format); - MemoryDescPtr out_candidate = std::make_shared(getOutputShapeAtPort(0), outputDataType, format); + MemoryDescPtr in_candidate = + std::make_shared(getInputShapeAtPort(0), inputDataType, format); + MemoryDescPtr out_candidate = + std::make_shared(getOutputShapeAtPort(0), outputDataType, format); createDescriptor({in_candidate}, {out_candidate}); } } } -void Deconvolution::initPaddingR(const Shape &inShape, const Shape &outShape) { +void Deconvolution::initPaddingR(const Shape& inShape, const Shape& outShape) { for (size_t i = 0; i < deconvAttrs.paddingR.size(); i++) { int with_group = getAlgorithm() == Algorithm::DeconvolutionGrouped ? 
1 : 0; const auto& weightDims = getWeightDims(); int krn = weightDims[with_group + 2 + i]; int src = outShape.getStaticDims()[2 + i]; int dst = inShape.getStaticDims()[2 + i]; - krn = (krn - 1)*(deconvAttrs.dilation[i] + 1) + 1; + krn = (krn - 1) * (deconvAttrs.dilation[i] + 1) + 1; deconvAttrs.paddingR[i] = (dst - 1) * deconvAttrs.stride[i] - (src - krn + deconvAttrs.paddingL[i]); } } @@ -585,11 +614,22 @@ void Deconvolution::setPostOps(dnnl::primitive_attr& attr, const VectorDims& dim // For deconv OP, Deconv_OC = IC, Deconv_IC = OC. // Openvino per-channel weight scales are applied on IC/Deconv_OC dimension. // So for deconvolution, - // Weight dims in NON-Group deconv: [Deconv_OC, Deconv_IC, KH, KW], perchannel weight scale is applied on Deconv_OC DIM + // Weight dims in NON-Group deconv: [Deconv_OC, Deconv_IC, KH, KW], perchannel weight scale is applied on Deconv_OC + // DIM // weiScaleMaskPerChannel = 1 << 0 - // Weight dims in Group deconv: [Group, Deconv_OC, Deconv_IC, KH, KW], perchannel weight scale is applied on GROUP and Deconv_OC, + // Weight dims in Group deconv: [Group, Deconv_OC, Deconv_IC, KH, KW], perchannel weight scale is applied on + // GROUP and Deconv_OC, // weiScaleMaskPerChannel = ( 1 << 0 | 1 << 1) = 0x03 - DnnlPostOpsComposerLegacy dnnlpoc(getEngine(), attr, ops, postOpsArgs, dims, 1, isInt8, withGroups ? 3 : 1 << 0, getDQScales(), withBiases); + DnnlPostOpsComposerLegacy dnnlpoc(getEngine(), + attr, + ops, + postOpsArgs, + dims, + 1, + isInt8, + withGroups ? 
3 : 1 << 0, + getDQScales(), + withBiases); for (size_t i = 0; i < fusedWith.size(); ++i) { auto& node = fusedWith[i]; @@ -634,7 +674,7 @@ bool Deconvolution::needShapeInfer() const { return false; } -VectorDims Deconvolution::shapeInferInternal(const VectorDims &inDims, std::vector outSpDims) const { +VectorDims Deconvolution::shapeInferInternal(const VectorDims& inDims, std::vector outSpDims) const { std::vector> inputShapesRefs{std::ref(inDims), std::ref(getWeightDims())}; std::unordered_map inputValues; VectorDims outSpDimsVecShape; @@ -679,7 +719,7 @@ void Deconvolution::execute(dnnl::stream strm) { for (size_t i = 0; i < getOriginalOutputsNumber(); i++) { dstMemory.push_back(getDstMemoryAtPort(i)); } - //TODO: need to pass post ops data + // TODO: need to pass post ops data execPtrDeconvACL->exec(srcMemory, dstMemory, nullptr); return; } @@ -697,43 +737,50 @@ void Deconvolution::execute(dnnl::stream strm) { namespace { dnnl::primitive_desc createDescriptorInternal(const dnnl::memory::desc& in_candidate, - const dnnl::memory::desc& wgh_candidate, - const dnnl::memory::desc& bias_candidate, - const dnnl::memory::desc& out_candidate, - const bool with_bias, - const std::vector& stride, - const std::vector& dilation, - const ov::CoordinateDiff& paddingL, - const ov::CoordinateDiff& paddingR, - const dnnl::primitive_attr& attr, - const dnnl::engine& engine) { - auto convertDims = [] (const std::vector& orig_dims) { + const dnnl::memory::desc& wgh_candidate, + const dnnl::memory::desc& bias_candidate, + const dnnl::memory::desc& out_candidate, + const bool with_bias, + const std::vector& stride, + const std::vector& dilation, + const ov::CoordinateDiff& paddingL, + const ov::CoordinateDiff& paddingR, + const dnnl::primitive_attr& attr, + const dnnl::engine& engine) { + auto convertDims = [](const std::vector& orig_dims) { return memory::dims(orig_dims.begin(), orig_dims.end()); }; if (with_bias) { - return dnnl::deconvolution_forward::primitive_desc( - engine, - 
prop_kind::forward_inference, - dnnl::algorithm::deconvolution_direct, - in_candidate, wgh_candidate, bias_candidate, out_candidate, - convertDims(stride), convertDims(dilation), - convertDims(paddingL), convertDims(paddingR), - attr); + return dnnl::deconvolution_forward::primitive_desc(engine, + prop_kind::forward_inference, + dnnl::algorithm::deconvolution_direct, + in_candidate, + wgh_candidate, + bias_candidate, + out_candidate, + convertDims(stride), + convertDims(dilation), + convertDims(paddingL), + convertDims(paddingR), + attr); } else { - return dnnl::deconvolution_forward::primitive_desc( - engine, - prop_kind::forward_inference, - dnnl::algorithm::deconvolution_direct, - in_candidate, wgh_candidate, out_candidate, - convertDims(stride), convertDims(dilation), - convertDims(paddingL), convertDims(paddingR), - attr); + return dnnl::deconvolution_forward::primitive_desc(engine, + prop_kind::forward_inference, + dnnl::algorithm::deconvolution_direct, + in_candidate, + wgh_candidate, + out_candidate, + convertDims(stride), + convertDims(dilation), + convertDims(paddingL), + convertDims(paddingR), + attr); } } -} // namespace +} // namespace -Node::AttrPtr Deconvolution::makePrimitiveAttr(const VectorDims &dims) { +Node::AttrPtr Deconvolution::makePrimitiveAttr(const VectorDims& dims) { auto attr = std::make_shared(dnnl::primitive_attr()); setPostOps(*attr, dims); @@ -748,81 +795,61 @@ Node::AttrPtr Deconvolution::initPrimitiveAttr() { const std::vector& Deconvolution::getDefaultImplPriority() { static const std::vector priorities { impl_desc_type::unknown, - // Undef impl type is used to express use-cases there real type is unkown during compilation - // Undef has higher priority than defined types in order to force primitive selection logic to make decision based on other properties - impl_desc_type::undef, - impl_desc_type::brgconv_avx512_amx_1x1, - impl_desc_type::brgconv_avx512_amx, - impl_desc_type::jit_avx512_amx_dw, - 
impl_desc_type::jit_avx512_amx_1x1, - impl_desc_type::jit_avx512_amx, - impl_desc_type::brgconv_avx512_1x1, - impl_desc_type::brgconv_avx512, - impl_desc_type::jit_avx512_dw, - impl_desc_type::jit_avx512_1x1, - impl_desc_type::jit_avx512, - impl_desc_type::brgconv_avx2_1x1, - impl_desc_type::brgconv_avx2, - impl_desc_type::jit_uni_dw, - impl_desc_type::jit_uni_1x1, - impl_desc_type::jit_uni, - impl_desc_type::jit_avx2_dw, - impl_desc_type::jit_avx2_1x1, - impl_desc_type::jit_avx2, - impl_desc_type::jit_avx_dw, - impl_desc_type::jit_avx_1x1, - impl_desc_type::jit_avx, - impl_desc_type::jit_sse42_dw, - impl_desc_type::jit_sse42_1x1, - impl_desc_type::jit_sse42, + // Undef impl type is used to express use-cases there real type is unkown during compilation + // Undef has higher priority than defined types in order to force primitive selection logic to make decision + // based on other properties + impl_desc_type::undef, impl_desc_type::brgconv_avx512_amx_1x1, impl_desc_type::brgconv_avx512_amx, + impl_desc_type::jit_avx512_amx_dw, impl_desc_type::jit_avx512_amx_1x1, impl_desc_type::jit_avx512_amx, + impl_desc_type::brgconv_avx512_1x1, impl_desc_type::brgconv_avx512, impl_desc_type::jit_avx512_dw, + impl_desc_type::jit_avx512_1x1, impl_desc_type::jit_avx512, impl_desc_type::brgconv_avx2_1x1, + impl_desc_type::brgconv_avx2, impl_desc_type::jit_uni_dw, impl_desc_type::jit_uni_1x1, + impl_desc_type::jit_uni, impl_desc_type::jit_avx2_dw, impl_desc_type::jit_avx2_1x1, + impl_desc_type::jit_avx2, impl_desc_type::jit_avx_dw, impl_desc_type::jit_avx_1x1, impl_desc_type::jit_avx, + impl_desc_type::jit_sse42_dw, impl_desc_type::jit_sse42_1x1, impl_desc_type::jit_sse42, #if defined(OPENVINO_ARCH_ARM64) - impl_desc_type::jit_asimd, + impl_desc_type::jit_asimd, #endif - impl_desc_type::gemm_any, - impl_desc_type::gemm_blas, - impl_desc_type::gemm_avx512, - impl_desc_type::gemm_avx2, - impl_desc_type::gemm_avx, - impl_desc_type::gemm_sse42, - impl_desc_type::gemm_acl, - 
impl_desc_type::acl, - impl_desc_type::jit_gemm, - impl_desc_type::ref_any, - impl_desc_type::ref, + impl_desc_type::gemm_any, impl_desc_type::gemm_blas, impl_desc_type::gemm_avx512, impl_desc_type::gemm_avx2, + impl_desc_type::gemm_avx, impl_desc_type::gemm_sse42, impl_desc_type::gemm_acl, impl_desc_type::acl, + impl_desc_type::jit_gemm, impl_desc_type::ref_any, impl_desc_type::ref, }; if (!asymmetricPaddingAnd1x1) return priorities; static const std::vector priorities_wo_brgemm = [&] { - std::vectorresult; - std::copy_if(priorities.begin(), priorities.end(), std::back_inserter(result), - [](impl_desc_type type) { return !(type & impl_desc_type::brgconv); }); - return result;}(); + std::vector result; + std::copy_if(priorities.begin(), priorities.end(), std::back_inserter(result), [](impl_desc_type type) { + return !(type & impl_desc_type::brgconv); + }); + return result; + }(); return priorities_wo_brgemm; } bool Deconvolution::isImplicit1x1PaddingAsymmetric(const VectorDims& inputDims) { - auto isZero = [](std::ptrdiff_t i) { return i == 0; }; + auto isZero = [](std::ptrdiff_t i) { + return i == 0; + }; size_t spatialRank = getInputShapeAtPort(0).getRank() - 2; - if (is1x1 && std::all_of(deconvAttrs.paddingR.begin(), deconvAttrs.paddingR.end(), isZero) - && std::all_of(deconvAttrs.paddingL.begin(), deconvAttrs.paddingL.end(), isZero) - && std::all_of(deconvAttrs.outputPadding.begin(), deconvAttrs.outputPadding.end(), isZero) - ) { - auto calPaddingEnd = [](int64_t i, int64_t o, int64_t s) -> int64_t { - // Accoriding to https://pytorch.org/docs/stable/generated/torch.nn.ConvTranspose2d.html, - // output[i] = (input[i] -1) * stride[i] - 2 x padding[i] + dilation[i] x (kernel_size[i] - 1) + output_padding[i] + 1. - // When kernel_size[i] = 1, output_padding = 0, output[i] = (input[i] -1) * stride[i] - 2 x padding[i] + 1. 
- // implicit padding end = 2 x padding[i] = (input[i] -1) * stride[i] + 1 - output[i] - return (i - 1) * s + 1 - o;}; - for (size_t i = 0; i < spatialRank; i++) { - int64_t inputDim = static_cast(inputDims[i + 2]); - int64_t outputDim = static_cast(lastOutputSpatialDims[i]); - int64_t stride = static_cast(deconvAttrs.stride[i]); - if (calPaddingEnd(inputDim, outputDim, stride) > 0) { - return true; - } + if (is1x1 && std::all_of(deconvAttrs.paddingR.begin(), deconvAttrs.paddingR.end(), isZero) && + std::all_of(deconvAttrs.paddingL.begin(), deconvAttrs.paddingL.end(), isZero) && + std::all_of(deconvAttrs.outputPadding.begin(), deconvAttrs.outputPadding.end(), isZero)) { + auto calPaddingEnd = [](int64_t i, int64_t o, int64_t s) -> int64_t { + // Accoriding to https://pytorch.org/docs/stable/generated/torch.nn.ConvTranspose2d.html, + // output[i] = (input[i] -1) * stride[i] - 2 x padding[i] + dilation[i] x (kernel_size[i] - 1) + + // output_padding[i] + 1. When kernel_size[i] = 1, output_padding = 0, output[i] = (input[i] -1) * stride[i] + // - 2 x padding[i] + 1. 
implicit padding end = 2 x padding[i] = (input[i] -1) * stride[i] + 1 - output[i] + return (i - 1) * s + 1 - o; + }; + for (size_t i = 0; i < spatialRank; i++) { + int64_t inputDim = static_cast(inputDims[i + 2]); + int64_t outputDim = static_cast(lastOutputSpatialDims[i]); + int64_t stride = static_cast(deconvAttrs.stride[i]); + if (calPaddingEnd(inputDim, outputDim, stride) > 0) { + return true; } + } } return false; } @@ -855,8 +882,10 @@ void Deconvolution::prepareParams() { dstMemoryDescs.push_back(getChildEdgeAt(i)->getMemory().getDescWithType()); } - execPtrDeconvACL = selected_pd->getExecutorFactoryAs()->makeExecutor(deconvAttrs, srcMemoryDescs, - dstMemoryDescs, *attr); + execPtrDeconvACL = selected_pd->getExecutorFactoryAs()->makeExecutor(deconvAttrs, + srcMemoryDescs, + dstMemoryDescs, + *attr); selected_pd->setImplementationType(execPtrDeconvACL->getImplType()); return; } @@ -892,7 +921,7 @@ void Deconvolution::prepareParams() { OPENVINO_THROW("Bias memory memory is undefined."); biasDesc = biasMemPtr->getDescWithType(); } - bool is1x1PaddingAsymmetric = false; + bool is1x1PaddingAsymmetric = false; if (externOutShape && (!isConstOutShape || isDynamicNode())) { // Check implicit asymmetric padding case for dynamic case and runtime output shape. is1x1PaddingAsymmetric = isImplicit1x1PaddingAsymmetric(getSrcMemoryAtPort(0)->getShape().getStaticDims()); @@ -918,34 +947,41 @@ void Deconvolution::prepareParams() { dnnl::memory::desc dnnlBiasDesc; const auto& weiDims = key.inp1->getShape().getStaticDims(); const auto srcDataType = key.inp0->getDataType(); - const auto weiDataType = (one_of(srcDataType, memory::data_type::s8, memory::data_type::u8)) ? - memory::data_type::s8 : srcDataType; + const auto weiDataType = + (one_of(srcDataType, memory::data_type::s8, memory::data_type::u8)) ? 
memory::data_type::s8 : srcDataType; auto wghDescAny = - dnnl::memory::desc(DnnlExtensionUtils::convertToDnnlDims(weiDims), - weiDataType, - memory::format_tag::any); + dnnl::memory::desc(DnnlExtensionUtils::convertToDnnlDims(weiDims), weiDataType, memory::format_tag::any); if (key.bias) dnnlBiasDesc = key.bias->getDnnlDesc(); - desc = createDescriptorInternal(key.inp0->getDnnlDesc(), wghDescAny, dnnlBiasDesc, key.out->getDnnlDesc(), - key.bias != nullptr, key.stride, key.dilation, key.paddingL, key.paddingR, key.attr, engine); + desc = createDescriptorInternal(key.inp0->getDnnlDesc(), + wghDescAny, + dnnlBiasDesc, + key.out->getDnnlDesc(), + key.bias != nullptr, + key.stride, + key.dilation, + key.paddingL, + key.paddingR, + key.attr, + engine); primitive_desc_iterator itpd = desc; executorPtr execPtr = nullptr; while (static_cast(itpd)) { impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str()); - //Skip the brgemm implemenation for asymmetric padding case because of the accuracy issue. + // Skip the brgemm implemenation for asymmetric padding case because of the accuracy issue. 
if (key.isImplicit1x1PaddingAsymmetric && (impl_type & impl_desc_type::brgconv)) continue; if (impl_type == key.implType) { auto prim_desc = deconvolution_forward::primitive_desc(itpd.get()); execPtr = std::make_shared(prim_desc, - key.inp0->getDnnlDesc(), - key.inp1->getDnnlDesc(), - key.out->getDnnlDesc(), - engine, - key.constWeight); + key.inp0->getDnnlDesc(), + key.inp1->getDnnlDesc(), + key.out->getDnnlDesc(), + engine, + key.constWeight); break; } @@ -955,16 +991,27 @@ void Deconvolution::prepareParams() { } if (!execPtr) { - auto inDesc = dnnl::memory::desc(DnnlExtensionUtils::convertToDnnlDims(key.inp0->getShape().getStaticDims()), - key.inp0->getDataType(), - memory::format_tag::any); - auto outDesc = dnnl::memory::desc(DnnlExtensionUtils::convertToDnnlDims(key.out->getShape().getStaticDims()), - key.out->getDataType(), - memory::format_tag::any); + auto inDesc = + dnnl::memory::desc(DnnlExtensionUtils::convertToDnnlDims(key.inp0->getShape().getStaticDims()), + key.inp0->getDataType(), + memory::format_tag::any); + auto outDesc = + dnnl::memory::desc(DnnlExtensionUtils::convertToDnnlDims(key.out->getShape().getStaticDims()), + key.out->getDataType(), + memory::format_tag::any); dnnl::primitive_desc anyDeconvDesc; - anyDeconvDesc = createDescriptorInternal(inDesc, wghDescAny, dnnlBiasDesc, outDesc, key.bias != nullptr, - key.stride, key.dilation, key.paddingL, key.paddingR, key.attr, engine); + anyDeconvDesc = createDescriptorInternal(inDesc, + wghDescAny, + dnnlBiasDesc, + outDesc, + key.bias != nullptr, + key.stride, + key.dilation, + key.paddingL, + key.paddingR, + key.attr, + engine); if (anyDeconvDesc) { auto prim_desc = deconvolution_forward::primitive_desc(anyDeconvDesc.get()); execPtr = std::make_shared(prim_desc, @@ -984,13 +1031,12 @@ void Deconvolution::prepareParams() { auto cache = context->getParamsCache(); auto result = cache->getOrCreate(key, builder); - execPtr = result.first; if (!execPtr) OPENVINO_THROW("Primitive descriptor was not 
found for node ", getName(), "."); primArgs[DNNL_ARG_SRC] = srcMemPtr->getPrimitive(); - primArgs[DNNL_ARG_DST]= dstMemPtr->getPrimitive(); + primArgs[DNNL_ARG_DST] = dstMemPtr->getPrimitive(); if (weightIsConst) { // const weight preparation/reordering needs to be done once at next execution // when the input weight data is guaranteed to be ready (considering possible const-folding @@ -1018,8 +1064,8 @@ void Deconvolution::prepareParams() { #endif } -void Deconvolution::createDescriptor(const std::vector &inputDesc, - const std::vector &outputDesc) { +void Deconvolution::createDescriptor(const std::vector& inputDesc, + const std::vector& outputDesc) { auto inDesc = inputDesc[0]->isDefined() ? inputDesc[0] : inputDesc[0]->cloneWithNewDims(inShape.getStaticDims()); auto dnnlInDesc = MemoryDescUtils::convertToDnnlBlockedMemoryDesc(*inDesc); const auto& in_candidate = dnnlInDesc.getDnnlDesc(); @@ -1040,26 +1086,38 @@ void Deconvolution::createDescriptor(const std::vector &inputDesc AttrPtr attr = initPrimitiveAttr(); if (withBiases) { memory::data_type bdt = memory::data_type::f32; - bias_candidate = dnnl::memory::desc(DnnlExtensionUtils::convertToDnnlDims(expectedBiasDims), bdt, memory::format_tag::any); + bias_candidate = + dnnl::memory::desc(DnnlExtensionUtils::convertToDnnlDims(expectedBiasDims), bdt, memory::format_tag::any); } - dnnl::memory::desc wgh_candidate(DnnlExtensionUtils::convertToDnnlDims(dnnlCompatibleWeiDims), isInt8 ? memory::data_type::s8 : dnnlInDesc.getDataType(), - memory::format_tag::any); - descs.emplace_back(createDescriptorInternal(in_candidate, wgh_candidate, bias_candidate, - out_candidate, withBiases, deconvAttrs.stride, deconvAttrs.dilation, - deconvAttrs.paddingL, deconvAttrs.paddingR, *attr, getEngine())); + dnnl::memory::desc wgh_candidate(DnnlExtensionUtils::convertToDnnlDims(dnnlCompatibleWeiDims), + isInt8 ? 
memory::data_type::s8 : dnnlInDesc.getDataType(), + memory::format_tag::any); + descs.emplace_back(createDescriptorInternal(in_candidate, + wgh_candidate, + bias_candidate, + out_candidate, + withBiases, + deconvAttrs.stride, + deconvAttrs.dilation, + deconvAttrs.paddingL, + deconvAttrs.paddingR, + *attr, + getEngine())); } -std::shared_ptr Deconvolution::getSrcMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const { +std::shared_ptr Deconvolution::getSrcMemDesc(const dnnl::primitive_desc& prim_desc, size_t idx) const { if (idx == 2 && !withBiases) { - //Expected dest shape; + // Expected dest shape; return std::make_shared(ov::element::i32, Shape(getInputShapeAtPort(2).getStaticDims())); } else if (idx > 0) { // weight and bias are exposed with the planar layout. // we need to store 'weight' input as edge, - // because at this moment we can't simple replace internal blob with input, since we need to save weight data as is, but with different order - return std::make_shared(getOriginalInputPrecisionAtPort(idx), Shape(getInputShapeAtPort(idx).getStaticDims())); + // because at this moment we can't simple replace internal blob with input, since we need to save weight data as + // is, but with different order + return std::make_shared(getOriginalInputPrecisionAtPort(idx), + Shape(getInputShapeAtPort(idx).getStaticDims())); } - //idx =0 case + // idx =0 case auto desc = prim_desc.src_desc(idx); if (getInputShapeAtPort(idx).isDynamic()) { return DnnlExtensionUtils::makeUndefinedDesc(desc, getInputShapeAtPort(idx)); @@ -1067,8 +1125,8 @@ std::shared_ptr Deconvolution::getSrcMemDesc(const dnnl::primitive_d return DnnlExtensionUtils::makeDescriptor(desc); } -std::shared_ptr Deconvolution::getDstMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const { - auto desc = prim_desc.dst_desc(idx); +std::shared_ptr Deconvolution::getDstMemDesc(const dnnl::primitive_desc& prim_desc, size_t idx) const { + auto desc = prim_desc.dst_desc(idx); if 
(getOutputShapeAtPort(idx).isDynamic()) { return DnnlExtensionUtils::makeUndefinedDesc(desc, getOutputShapeAtPort(idx)); } @@ -1082,7 +1140,8 @@ ov::element::Type Deconvolution::getRuntimePrecision() const { for (size_t i = 0; i < std::min(getParentEdges().size(), inputsNumLimit); i++) { auto parentEdge = getParentEdgeAt(i); if (parentEdge && parentEdge->getStatus() == Edge::Status::Validated) { - inputPrecisions.emplace_back(DnnlExtensionUtils::DataTypeToElementType((parentEdge->getMemoryPtr()->getDataType()))); + inputPrecisions.emplace_back( + DnnlExtensionUtils::DataTypeToElementType((parentEdge->getMemoryPtr()->getDataType()))); } } @@ -1090,11 +1149,12 @@ ov::element::Type Deconvolution::getRuntimePrecision() const { } Deconvolution::DeconvDNNLExecutor::DeconvDNNLExecutor(const dnnl::deconvolution_forward::primitive_desc& pd, - const dnnl::memory::desc& inMemDesc, - const dnnl::memory::desc& weightMemDesc, - const dnnl::memory::desc& outMemDesc, - const dnnl::engine& engine, - bool constWeight) : DnnlExecutor(pd) { + const dnnl::memory::desc& inMemDesc, + const dnnl::memory::desc& weightMemDesc, + const dnnl::memory::desc& outMemDesc, + const dnnl::engine& engine, + bool constWeight) + : DnnlExecutor(pd) { if (inMemDesc != getDnnlSrcDesc()) { inputReorders.insert({DNNL_ARG_SRC, IntermReorder(inMemDesc, getDnnlSrcDesc(), engine)}); } @@ -1112,7 +1172,7 @@ std::vector Deconvolution::readOutputSpatialDims() const { if (getParentEdges().size() < 3) { OPENVINO_THROW("Can't get output spatial dims. 
Inputs number = ", getParentEdges().size()); } - const auto &shapeMemPtr = getSrcMemoryAtPort(2); + const auto& shapeMemPtr = getSrcMemoryAtPort(2); if (!shapeMemPtr || !shapeMemPtr->isDefined()) { OPENVINO_THROW("'output_shape' input memory is undefined."); } @@ -1120,20 +1180,20 @@ std::vector Deconvolution::readOutputSpatialDims() const { if (shapeMemPtr->getStaticDims()[0] != spDimsNum) { OPENVINO_THROW("Can't read output spatial dims, beause 'output_shape' input has incorrect number of elements"); } - const int32_t *outShapePtr = shapeMemPtr->getDataAs(); + const int32_t* outShapePtr = shapeMemPtr->getDataAs(); std::vector outSpDims(outShapePtr, outShapePtr + shapeMemPtr->getStaticDims()[0]); return outSpDims; } bool Deconvolution::canFuseBias() const { - //ONEDNN deconvolution_fwd_t primitive can support bias fusing. but has different implementations. - //For the brgdeconv implementation in the deconv list, bias is implemented via JIT kernel. - //For the fall back ref implementation entry(previous conv_backward_data), bias is implemented via reference post-ops. - //It is difficult to recognize whether the deconv will run with brg or fall back to backwards data implementation on the fusing - //transformation stage. In the end, all the deconv should run with brg implement. - //And in model zoo only limited deconv has bias or other post-ops in IR. - //Based on above, enable the bias fusing for all deconv implementations. - return (externOutShape ? getParentEdges().size() == 3 : getParentEdges().size() == 2); + // ONEDNN deconvolution_fwd_t primitive can support bias fusing. but has different implementations. + // For the brgdeconv implementation in the deconv list, bias is implemented via JIT kernel. + // For the fall back ref implementation entry(previous conv_backward_data), bias is implemented via reference + // post-ops. 
It is difficult to recognize whether the deconv will run with brg or fall back to backwards data + // implementation on the fusing transformation stage. In the end, all the deconv should run with brg implement. And + // in model zoo only limited deconv has bias or other post-ops in IR. Based on above, enable the bias fusing for all + // deconv implementations. + return (externOutShape ? getParentEdges().size() == 3 : getParentEdges().size() == 2); } void Deconvolution::initSupportedPrimitiveDescriptors() { @@ -1144,7 +1204,7 @@ void Deconvolution::initSupportedPrimitiveDescriptors() { VectorDims inDims, outDims; std::tie(inDims, outDims) = makeDummyInOutShape(); - auto tmpInShape = Shape(inDims); + auto tmpInShape = Shape(inDims); auto tmpOutShape = Shape(outDims); initPaddingR(tmpInShape, tmpOutShape); @@ -1155,18 +1215,19 @@ void Deconvolution::initSupportedPrimitiveDescriptors() { config.outConfs.resize(getOriginalOutputsNumber()); config.inConfs[0].setMemDesc( - creatorsMap.at(format)->createSharedDesc(getOriginalInputPrecisionAtPort(0), getInputShapeAtPort(0))); + creatorsMap.at(format)->createSharedDesc(getOriginalInputPrecisionAtPort(0), getInputShapeAtPort(0))); config.inConfs[1].setMemDesc( - creatorsMap.at(weights_format)->createSharedDesc(getOriginalInputPrecisionAtPort(0), getInputShapeAtPort(1))); + creatorsMap.at(weights_format) + ->createSharedDesc(getOriginalInputPrecisionAtPort(0), getInputShapeAtPort(1))); for (size_t i = 2; i < getParentEdges().size(); ++i) { config.inConfs[i].setMemDesc( - creatorsMap.at(format)->createSharedDesc(getOriginalInputPrecisionAtPort(0), getInputShapeAtPort(i))); + creatorsMap.at(format)->createSharedDesc(getOriginalInputPrecisionAtPort(0), getInputShapeAtPort(i))); } for (size_t i = 0; i < config.outConfs.size(); ++i) { config.outConfs[i].setMemDesc( - creatorsMap.at(format)->createSharedDesc(getOriginalOutputPrecisionAtPort(0), getOutputShapeAtPort(i))); + 
creatorsMap.at(format)->createSharedDesc(getOriginalOutputPrecisionAtPort(0), getOutputShapeAtPort(i))); } std::vector srcMemoryDescs; @@ -1180,8 +1241,11 @@ void Deconvolution::initSupportedPrimitiveDescriptors() { dstMemoryDescs.push_back(config.outConfs[i].getMemDesc()->clone()); } - auto factory = std::make_shared(deconvAttrs, srcMemoryDescs, dstMemoryDescs, - std::make_shared(context, getImplPriority())); + auto factory = + std::make_shared(deconvAttrs, + srcMemoryDescs, + dstMemoryDescs, + std::make_shared(context, getImplPriority())); supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::gemm_acl, factory); }; @@ -1189,7 +1253,6 @@ void Deconvolution::initSupportedPrimitiveDescriptors() { pushDesc(LayoutType::ncsp); } - -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/deconv.h b/src/plugins/intel_cpu/src/nodes/deconv.h index d94bcd8bcaca13..1c3e1fe8978918 100644 --- a/src/plugins/intel_cpu/src/nodes/deconv.h +++ b/src/plugins/intel_cpu/src/nodes/deconv.h @@ -29,27 +29,32 @@ class Deconvolution : public Node { return static_cast(getParentEdges().size()); } - std::shared_ptr getSrcMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const override; - std::shared_ptr getDstMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const override; + std::shared_ptr getSrcMemDesc(const dnnl::primitive_desc& prim_desc, size_t idx) const override; + std::shared_ptr getDstMemDesc(const dnnl::primitive_desc& prim_desc, size_t idx) const override; ov::element::Type getRuntimePrecision() const override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; bool canFuse(const NodePtr& node) const override; - const VectorDims& getWeightDims() const { return getInputShapeAtPort(1).getStaticDims(); } - const std::vector& getStride() const { return deconvAttrs.stride; } + const 
VectorDims& getWeightDims() const { + return getInputShapeAtPort(1).getStaticDims(); + } + const std::vector& getStride() const { + return deconvAttrs.stride; + } void prepareParams() override; void execute(dnnl::stream strm) override; - void executeDynamicImpl(dnnl::stream strm) override { execute(strm); } + void executeDynamicImpl(dnnl::stream strm) override { + execute(strm); + } bool needShapeInfer() const override; bool canFuseBias() const; bool canBeExecutedInInt8() const override; const std::vector& getDefaultImplPriority() override; - protected: AttrPtr initPrimitiveAttr() override; AttrPtr makePrimitiveAttr(const VectorDims& dims); @@ -60,13 +65,13 @@ class Deconvolution : public Node { using executorPtr = std::shared_ptr; executorPtr execPtr = nullptr; class DeconvDNNLExecutor : public DnnlExecutor { - public: - DeconvDNNLExecutor(const dnnl::deconvolution_forward::primitive_desc& pd, - const dnnl::memory::desc& inMemDesc, - const dnnl::memory::desc& weightMemDesc, - const dnnl::memory::desc& outMemDesc, - const dnnl::engine& engine, - bool constWeight); + public: + DeconvDNNLExecutor(const dnnl::deconvolution_forward::primitive_desc& pd, + const dnnl::memory::desc& inMemDesc, + const dnnl::memory::desc& weightMemDesc, + const dnnl::memory::desc& outMemDesc, + const dnnl::engine& engine, + bool constWeight); }; bool isImplicit1x1PaddingAsymmetric(const VectorDims& inputDims); @@ -79,8 +84,8 @@ class Deconvolution : public Node { size_t IC = 0; size_t OC = 0; std::vector lastOutputSpatialDims; - VectorDims dnnlCompatibleWeiDims {}; - VectorDims expectedBiasDims {}; + VectorDims dnnlCompatibleWeiDims{}; + VectorDims expectedBiasDims{}; bool useACL = false; DeconvAttrs deconvAttrs; @@ -93,9 +98,9 @@ class Deconvolution : public Node { MemoryPtr dnnlCompatibleWeights = nullptr; std::shared_ptr attr; - void setPostOps(dnnl::primitive_attr &attr, const VectorDims &dims); - VectorDims shapeInferInternal(const VectorDims &inDims, std::vector outSpDims) const; - 
void initPaddingR(const Shape &inShape, const Shape &outShape); + void setPostOps(dnnl::primitive_attr& attr, const VectorDims& dims); + VectorDims shapeInferInternal(const VectorDims& inDims, std::vector outSpDims) const; + void initPaddingR(const Shape& inShape, const Shape& outShape); std::vector readOutputSpatialDims() const; std::pair makeDummyInOutShape(); bool withBiases = false; @@ -110,6 +115,6 @@ class Deconvolution : public Node { bool isConstOutShape = false; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/def_conv.cpp b/src/plugins/intel_cpu/src/nodes/def_conv.cpp index 0167a18673c444..7c5427d0def045 100644 --- a/src/plugins/intel_cpu/src/nodes/def_conv.cpp +++ b/src/plugins/intel_cpu/src/nodes/def_conv.cpp @@ -4,21 +4,20 @@ #include "def_conv.h" -#include +#include +#include +#include #include #include -#include -#include "openvino/core/parallel.hpp" -#include "memory_desc/dnnl_blocked_memory_desc.h" #include "common/primitive_hashing_utils.hpp" -#include "openvino/util/pp.hpp" - -#include "dnnl_types.h" -#include "dnnl_extension_utils.h" #include "cpu/x64/jit_generator.hpp" -#include +#include "dnnl_extension_utils.h" +#include "dnnl_types.h" +#include "memory_desc/dnnl_blocked_memory_desc.h" +#include "openvino/core/parallel.hpp" +#include "openvino/util/pp.hpp" using namespace dnnl; using namespace dnnl::impl; @@ -30,7 +29,7 @@ namespace ov { namespace intel_cpu { namespace node { #if defined(OPENVINO_ARCH_X86_64) -#define GET_OFF(field) offsetof(jit_def_conv_call_args, field) +# define GET_OFF(field) offsetof(jit_def_conv_call_args, field) template struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_generator { @@ -38,7 +37,9 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ constexpr static int sampledPointsPerPixel = 
DeformableConvolution::sampledPointsPerPixel; - explicit jit_uni_def_conv_kernel_f32(const jit_def_conv_params& jcp) : jit_uni_def_conv_kernel(jcp), jit_generator(jit_name()) {} + explicit jit_uni_def_conv_kernel_f32(const jit_def_conv_params& jcp) + : jit_uni_def_conv_kernel(jcp), + jit_generator(jit_name()) {} void create_ker() override { jit_generator::create_kernel(); @@ -72,8 +73,8 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ } private: - using Vmm = typename conditional3::type; + using Vmm = + typename conditional3::type; const int vlen = cpu_isa_traits::vlen; using Ymm = const Xbyak::Ymm; @@ -113,18 +114,29 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ Xbyak::Opmask ktail_mask = Xbyak::Opmask(2); - inline Xbyak::Address table_val(int index) - { return ptr[reg_table + index * vlen]; } + inline Xbyak::Address table_val(int index) { + return ptr[reg_table + index * vlen]; + } - inline Vmm get_vmm_ker(int idx) { return Vmm(idx + 0); } - inline Vmm get_vmm_src(int idx) { return Vmm(idx + 1); } - inline Vmm get_vmm_acc(int idx) { return Vmm(idx + jcp_.ur_w + 1); } - inline Ymm get_ymm_acc(int idx) { return Ymm(idx + jcp_.ur_w + 1); } - inline Xmm get_xmm_acc(int idx) { return Xmm(idx + jcp_.ur_w + 1); } + inline Vmm get_vmm_ker(int idx) { + return Vmm(idx + 0); + } + inline Vmm get_vmm_src(int idx) { + return Vmm(idx + 1); + } + inline Vmm get_vmm_acc(int idx) { + return Vmm(idx + jcp_.ur_w + 1); + } + inline Ymm get_ymm_acc(int idx) { + return Ymm(idx + jcp_.ur_w + 1); + } + inline Xmm get_xmm_acc(int idx) { + return Xmm(idx + jcp_.ur_w + 1); + } Xbyak::Label l_table; - inline void checkZeroWei(const Xbyak::Xmm &x1, Label &nullifyLabel) { + inline void checkZeroWei(const Xbyak::Xmm& x1, Label& nullifyLabel) { ptest(x1, x1); jz(nullifyLabel); } @@ -135,13 +147,16 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ mov(reg_ow_pos, 0); - L(ow_loop_main); { + 
L(ow_loop_main); + { cmp(reg_ow_pos, jcp_.ow - jcp_.ur_w); jg(ow_tail, T_NEAR); oc_loop(jcp_.ur_w); - add(reg_sampled_wei, jcp_.ur_w * jcp_.kh * jcp_.kw * sampledPointsPerPixel * jcp_.typesize_sampled_wei); // type = float - add(reg_sampled_offs, jcp_.ur_w * jcp_.kh * jcp_.kw * sampledPointsPerPixel * jcp_.typesize_sampled_offsets); // type = int + add(reg_sampled_wei, + jcp_.ur_w * jcp_.kh * jcp_.kw * sampledPointsPerPixel * jcp_.typesize_sampled_wei); // type = float + add(reg_sampled_offs, + jcp_.ur_w * jcp_.kh * jcp_.kw * sampledPointsPerPixel * jcp_.typesize_sampled_offsets); // type = int add(reg_output, jcp_.ur_w * jcp_.oc * jcp_.typesize_out); @@ -149,7 +164,8 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ jmp(ow_loop_main, T_NEAR); } - L(ow_tail); { + L(ow_tail); + { if (jcp_.ow % jcp_.ur_w != 0) oc_loop(jcp_.ow % jcp_.ur_w); } @@ -191,7 +207,8 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ for (int ic = 0; ic < ic_step; ic++) { for (int ow = 0; ow < ow_step; ow++) { Vmm vmm_src = get_vmm_src(ow); - size_t inp_off = (size_t) ow * jcp_.kh * jcp_.kw * jcp_.ic + kh * jcp_.kw * jcp_.ic + kw * jcp_.ic + ic; + size_t inp_off = + (size_t)ow * jcp_.kh * jcp_.kw * jcp_.ic + kh * jcp_.kw * jcp_.ic + kw * jcp_.ic + ic; uni_vbroadcastss(vmm_src, ptr[aux2_reg_input_buffer + inp_off * jcp_.typesize_in]); } @@ -199,10 +216,10 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ for (int r = 0; r < repeats; r++) { for (int ocb = 0; ocb < oc_blocks_step; ocb++) { Vmm vmm_ker = get_vmm_ker(0); - size_t ker_off = (size_t) ocb * jcp_.nb_ic * jcp_.kh * jcp_.kw * jcp_.ic_block * jcp_.oc_block + - kh * jcp_.kw * jcp_.ic_block * jcp_.oc_block + - kw * jcp_.ic_block * jcp_.oc_block + - ic * jcp_.oc_block + r * jcp_.oc_block / 2; + size_t ker_off = + (size_t)ocb * jcp_.nb_ic * jcp_.kh * jcp_.kw * jcp_.ic_block * jcp_.oc_block + + kh * jcp_.kw * jcp_.ic_block * jcp_.oc_block + kw 
* jcp_.ic_block * jcp_.oc_block + + ic * jcp_.oc_block + r * jcp_.oc_block / 2; uni_vmovups(vmm_ker, ptr[aux2_reg_kernel + ker_off * jcp_.typesize_in]); for (int ow = 0; ow < ow_step; ow++) { @@ -248,7 +265,8 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ init_accums(ow_step, oc_blocks_step, oc_step); - L(ic_main_loop); { + L(ic_main_loop); + { cmp(reg_ic_iter, jcp_.ic_block); jl(ic_tail, T_NEAR); @@ -259,7 +277,8 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ jmp(ic_main_loop, T_NEAR); } - L(ic_tail); { + L(ic_tail); + { if (jcp_.ic % jcp_.ic_block != 0) { apply_filter(ow_step, oc_blocks_step, oc_step, jcp_.ic % jcp_.ic_block); } @@ -283,7 +302,8 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ xor_(reg_dg_iter, reg_dg_iter); const int ic_per_def_group = jcp_.ic / jcp_.dg; - L(dg_loop); { + L(dg_loop); + { cmp(reg_dg_iter, jcp_.dg); jge(dg_loop_end, T_NEAR); @@ -326,7 +346,8 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ Xmm xmm_w4 = Xmm(5); Xmm xmm_v1 = Xmm(2); - Xmm xmm_v2 = Xmm(3);; + Xmm xmm_v2 = Xmm(3); + ; Xmm xmm_v3 = Xmm(6); Xmm xmm_v4 = Xmm(7); @@ -341,7 +362,8 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ Vmm vmm_v4 = Vmm(xmm_v4.getIdx()); // offsets computation - size_t ind_off_hh = sampledPointsPerPixel * (((size_t) kh * jcp_.kw + kw) + ow * (jcp_.kh * jcp_.kw)); + size_t ind_off_hh = + sampledPointsPerPixel * (((size_t)kh * jcp_.kw + kw) + ow * (jcp_.kh * jcp_.kw)); size_t ind_off_hl = ind_off_hh + 1; size_t ind_off_lh = ind_off_hl + 1; size_t ind_off_ll = ind_off_lh + 1; @@ -366,12 +388,16 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ jl(ic_loop_tail, T_NEAR); // check zero markers - uni_vbroadcastss(xmm_v1, dword[aux_reg_sampled_wei + ind_off_ll * jcp_.typesize_sampled_wei]); - uni_vbroadcastss(xmm_v2, dword[aux_reg_sampled_wei + 
ind_off_hl * jcp_.typesize_sampled_wei]); - uni_vbroadcastss(xmm_v3, dword[aux_reg_sampled_wei + ind_off_lh * jcp_.typesize_sampled_wei]); - uni_vbroadcastss(xmm_v4, dword[aux_reg_sampled_wei + ind_off_hh * jcp_.typesize_sampled_wei]); + uni_vbroadcastss(xmm_v1, + dword[aux_reg_sampled_wei + ind_off_ll * jcp_.typesize_sampled_wei]); + uni_vbroadcastss(xmm_v2, + dword[aux_reg_sampled_wei + ind_off_hl * jcp_.typesize_sampled_wei]); + uni_vbroadcastss(xmm_v3, + dword[aux_reg_sampled_wei + ind_off_lh * jcp_.typesize_sampled_wei]); + uni_vbroadcastss(xmm_v4, + dword[aux_reg_sampled_wei + ind_off_hh * jcp_.typesize_sampled_wei]); - size_t input_buffer_off = (size_t) kh * jcp_.kw * jcp_.ic + kw * jcp_.ic; + size_t input_buffer_off = (size_t)kh * jcp_.kw * jcp_.ic + kw * jcp_.ic; uni_vpmovsxdq(xmm_v1_off, xmm_v1_off); uni_vmovq(reg_tmp_64, xmm_v1_off); @@ -382,9 +408,7 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ uni_vmulps(vmm_v1, vmm_v1, vmm_w1); jmp(nullify_v1_end, T_NEAR); L(nullify_v1); - { - uni_vpxor(vmm_v1, vmm_v1, vmm_v1); - } + { uni_vpxor(vmm_v1, vmm_v1, vmm_v1); } L(nullify_v1_end); uni_vpmovsxdq(xmm_v2_off, xmm_v2_off); @@ -396,9 +420,7 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ uni_vmulps(vmm_v2, vmm_v2, vmm_w2); jmp(nullify_v2_end, T_NEAR); L(nullify_v2); - { - uni_vpxor(vmm_v2, vmm_v2, vmm_v2); - } + { uni_vpxor(vmm_v2, vmm_v2, vmm_v2); } L(nullify_v2_end); uni_vpmovsxdq(xmm_v3_off, xmm_v3_off); @@ -410,9 +432,7 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ uni_vmulps(vmm_v3, vmm_v3, vmm_w3); jmp(nullify_v3_end, T_NEAR); L(nullify_v3); - { - uni_vpxor(vmm_v3, vmm_v3, vmm_v3); - } + { uni_vpxor(vmm_v3, vmm_v3, vmm_v3); } L(nullify_v3_end); uni_vpmovsxdq(xmm_v4_off, xmm_v4_off); @@ -424,9 +444,7 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ uni_vmulps(vmm_v4, vmm_v4, vmm_w4); jmp(nullify_v4_end, T_NEAR); 
L(nullify_v4); - { - uni_vpxor(vmm_v4, vmm_v4, vmm_v4); - } + { uni_vpxor(vmm_v4, vmm_v4, vmm_v4); } L(nullify_v4_end); uni_vaddps(vmm_v1, vmm_v1, vmm_v2); @@ -446,12 +464,16 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ jl(loop_end, T_NEAR); // check zero markers - uni_vbroadcastss(xmm_v1, dword[aux_reg_sampled_wei + ind_off_ll * jcp_.typesize_sampled_wei]); - uni_vbroadcastss(xmm_v2, dword[aux_reg_sampled_wei + ind_off_hl * jcp_.typesize_sampled_wei]); - uni_vbroadcastss(xmm_v3, dword[aux_reg_sampled_wei + ind_off_lh * jcp_.typesize_sampled_wei]); - uni_vbroadcastss(xmm_v4, dword[aux_reg_sampled_wei + ind_off_hh * jcp_.typesize_sampled_wei]); - - size_t input_buffer_off = (size_t) kh * jcp_.kw * jcp_.ic + kw * jcp_.ic; + uni_vbroadcastss(xmm_v1, + dword[aux_reg_sampled_wei + ind_off_ll * jcp_.typesize_sampled_wei]); + uni_vbroadcastss(xmm_v2, + dword[aux_reg_sampled_wei + ind_off_hl * jcp_.typesize_sampled_wei]); + uni_vbroadcastss(xmm_v3, + dword[aux_reg_sampled_wei + ind_off_lh * jcp_.typesize_sampled_wei]); + uni_vbroadcastss(xmm_v4, + dword[aux_reg_sampled_wei + ind_off_hh * jcp_.typesize_sampled_wei]); + + size_t input_buffer_off = (size_t)kh * jcp_.kw * jcp_.ic + kw * jcp_.ic; uni_vpmovsxdq(xmm_v1_off, xmm_v1_off); uni_vmovq(reg_tmp_64, xmm_v1_off); imul(reg_tmp_64, reg_tmp_64, jcp_.ic * jcp_.typesize_in); @@ -461,9 +483,7 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ uni_vmulss(xmm_v1, xmm_v1, xmm_w1); jmp(nullify_v1_end_tail, T_NEAR); L(nullify_v1_tail); - { - uni_vpxor(xmm_v1, xmm_v1, xmm_v1); - } + { uni_vpxor(xmm_v1, xmm_v1, xmm_v1); } L(nullify_v1_end_tail); uni_vpmovsxdq(xmm_v2_off, xmm_v2_off); @@ -475,9 +495,7 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ uni_vmulss(xmm_v2, xmm_v2, xmm_w2); jmp(nullify_v2_end_tail, T_NEAR); L(nullify_v2_tail); - { - uni_vpxor(xmm_v2, xmm_v2, xmm_v2); - } + { uni_vpxor(xmm_v2, xmm_v2, xmm_v2); } 
L(nullify_v2_end_tail); uni_vpmovsxdq(xmm_v3_off, xmm_v3_off); @@ -489,9 +507,7 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ uni_vmulss(xmm_v3, xmm_v3, xmm_w3); jmp(nullify_v3_end_tail, T_NEAR); L(nullify_v3_tail); - { - uni_vpxor(xmm_v3, xmm_v3, xmm_v3); - } + { uni_vpxor(xmm_v3, xmm_v3, xmm_v3); } L(nullify_v3_end_tail); uni_vpmovsxdq(xmm_v4_off, xmm_v4_off); @@ -503,9 +519,7 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ uni_vmulss(xmm_v4, xmm_v4, xmm_w4); jmp(nullify_v4_end_tail, T_NEAR); L(nullify_v4_tail); - { - uni_vpxor(xmm_v4, xmm_v4, xmm_v4); - } + { uni_vpxor(xmm_v4, xmm_v4, xmm_v4); } L(nullify_v4_end_tail); uni_vaddss(xmm_v1, xmm_v1, xmm_v2); @@ -524,8 +538,10 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ } } - add(aux_reg_sampled_wei, sampledPointsPerPixel * jcp_.kh * jcp_.kw * jcp_.oh * jcp_.ow * jcp_.typesize_sampled_wei); - add(aux_reg_sampled_offs, sampledPointsPerPixel * jcp_.kh * jcp_.kw * jcp_.oh * jcp_.ow * jcp_.typesize_sampled_offsets); + add(aux_reg_sampled_wei, + sampledPointsPerPixel * jcp_.kh * jcp_.kw * jcp_.oh * jcp_.ow * jcp_.typesize_sampled_wei); + add(aux_reg_sampled_offs, + sampledPointsPerPixel * jcp_.kh * jcp_.kw * jcp_.oh * jcp_.ow * jcp_.typesize_sampled_offsets); add(aux_reg_input, ic_per_def_group * jcp_.typesize_in); add(aux2_reg_input_buffer, ic_per_def_group * jcp_.typesize_in); inc(reg_dg_iter); @@ -542,7 +558,7 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ if (jcp_.with_bias) { for (int r = 0; r < repeats; r++) { for (int ocb = 0; ocb < oc_blocks_step; ocb++) { - size_t bias_off = (size_t) ocb * jcp_.oc_block + r * jcp_.oc_block / 2; + size_t bias_off = (size_t)ocb * jcp_.oc_block + r * jcp_.oc_block / 2; uni_vmovups(Vmm(0), ptr[aux_reg_bias + bias_off * jcp_.typesize_bia]); for (int ow = 0; ow < ow_step; ow++) { @@ -560,7 +576,8 @@ struct jit_uni_def_conv_kernel_f32 : 
public jit_uni_def_conv_kernel, public jit_ } for (int r = 0; r < repeats; r++) { - int tail_size = isa == cpu::x64::sse41 ? std::min(jcp_.oc_block / 2, oc_step - r * jcp_.oc_block / 2) : oc_step; + int tail_size = + isa == cpu::x64::sse41 ? std::min(jcp_.oc_block / 2, oc_step - r * jcp_.oc_block / 2) : oc_step; bool is_scalar_store = isa == cpu::x64::sse41 ? tail_size < jcp_.oc_block / 2 : tail_size < jcp_.oc_block; if (is_scalar_store) { for (int ow = 0; ow < ow_step; ow++) { @@ -568,11 +585,11 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ Xmm xmm_dst = get_xmm_acc(r * jcp_.ur_w * jcp_.nb_oc_blocking + ow); if (isa == avx512_core) { - size_t out_off = (size_t) ow * jcp_.oc; + size_t out_off = (size_t)ow * jcp_.oc; uni_vmovups(ptr[aux_reg_output + out_off * jcp_.typesize_out], vmm_dst | ktail_mask); } else { for (int oc = 0; oc < tail_size; oc++) { - size_t out_off = (size_t) ow * jcp_.oc + oc + r * (jcp_.oc_block / 2); + size_t out_off = (size_t)ow * jcp_.oc + oc + r * (jcp_.oc_block / 2); uni_vmovq(reg_tmp_64, xmm_dst); mov(ptr[aux_reg_output + out_off * jcp_.typesize_out], reg_tmp_32); @@ -593,7 +610,8 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ for (int ocb = 0; ocb < oc_blocks_step; ocb++) { for (int ow = 0; ow < ow_step; ow++) { Vmm vmm_acc = get_vmm_acc(r * jcp_.ur_w * jcp_.nb_oc_blocking + ocb * ow_step + ow); - size_t out_off = (size_t) ow * jcp_.oc * jcp_.ngroups + ocb * jcp_.oc_block + r * (jcp_.oc_block / 2); + size_t out_off = + (size_t)ow * jcp_.oc * jcp_.ngroups + ocb * jcp_.oc_block + r * (jcp_.oc_block / 2); uni_vmovups(ptr[aux_reg_output + out_off * jcp_.typesize_out], vmm_acc); } } @@ -629,14 +647,17 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ mov(aux_reg_bias, reg_bias); mov(reg_oc_work, jcp_.oc); - L(oc_unrolled_loop); { + L(oc_unrolled_loop); + { cmp(reg_oc_work, jcp_.nb_oc_blocking * jcp_.oc_block); jl(oc_main_loop, T_NEAR); 
ic_loop(ow_step, jcp_.nb_oc_blocking, jcp_.oc_block); store_output(ow_step, jcp_.nb_oc_blocking, jcp_.oc_block); - add(aux_reg_kernel, jcp_.nb_oc_blocking * jcp_.nb_ic * jcp_.kh * jcp_.kw * jcp_.ic_block * jcp_.oc_block * jcp_.typesize_in); + add(aux_reg_kernel, + jcp_.nb_oc_blocking * jcp_.nb_ic * jcp_.kh * jcp_.kw * jcp_.ic_block * jcp_.oc_block * + jcp_.typesize_in); add(aux_reg_output, jcp_.nb_oc_blocking * jcp_.oc_block * jcp_.typesize_out); add(aux_reg_bias, jcp_.nb_oc_blocking * jcp_.oc_block * jcp_.typesize_bia); sub(reg_oc_work, jcp_.nb_oc_blocking * jcp_.oc_block); @@ -644,7 +665,8 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ jmp(oc_unrolled_loop, T_NEAR); } - L(oc_main_loop); { + L(oc_main_loop); + { cmp(reg_oc_work, jcp_.oc_block); jl(oc_tail, T_NEAR); @@ -659,7 +681,8 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ jmp(oc_main_loop, T_NEAR); } - L(oc_tail); { + L(oc_tail); + { if (jcp_.oc % jcp_.oc_block != 0) { ic_loop(ow_step, 1, jcp_.oc % jcp_.oc_block); store_output(ow_step, 1, jcp_.oc % jcp_.oc_block); @@ -672,11 +695,12 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ } }; #endif -bool DeformableConvolution::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool DeformableConvolution::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage) noexcept { try { if (!one_of(op->get_type_info(), - ov::op::v1::DeformableConvolution::get_type_info_static(), - ov::op::v8::DeformableConvolution::get_type_info_static())) { + ov::op::v1::DeformableConvolution::get_type_info_static(), + ov::op::v8::DeformableConvolution::get_type_info_static())) { errorMessage = "Node is not an instance of DeformableConvolution form the operation set v1 or v8."; return false; } @@ -721,16 +745,16 @@ size_t DefConvKey::hash() const { return seed; } -bool DefConvKey::operator==(const DefConvKey &rhs) const { +bool 
DefConvKey::operator==(const DefConvKey& rhs) const { bool retVal = true; for (size_t i = 0; i < descVector.size(); i++) { if (descVector[i] != rhs.descVector[i]) { retVal = retVal && descVector[i] && rhs.descVector[i] && - descVector[i]->getBlockDims() == rhs.descVector[i]->getBlockDims() && - descVector[i]->getStrides() == rhs.descVector[i]->getStrides() && - descVector[i]->getOrder() == rhs.descVector[i]->getOrder() && - descVector[i]->getOffsetPaddingToData() == rhs.descVector[i]->getOffsetPaddingToData() && - descVector[i]->getOffsetPadding() == rhs.descVector[i]->getOffsetPadding(); + descVector[i]->getBlockDims() == rhs.descVector[i]->getBlockDims() && + descVector[i]->getStrides() == rhs.descVector[i]->getStrides() && + descVector[i]->getOrder() == rhs.descVector[i]->getOrder() && + descVector[i]->getOffsetPaddingToData() == rhs.descVector[i]->getOffsetPaddingToData() && + descVector[i]->getOffsetPadding() == rhs.descVector[i]->getOffsetPadding(); } } @@ -742,7 +766,7 @@ bool DefConvKey::operator==(const DefConvKey &rhs) const { return retVal; } -} // namespace +} // namespace DeformableConvolution::DeformableConvolution(const std::shared_ptr& op, const GraphContext::CPtr context) : Node(op, context, NgraphShapeInferFactory(op)) { @@ -825,13 +849,14 @@ void DeformableConvolution::initSupportedPrimitiveDescriptors() { impl_desc_type impl_type; const int simd_w = mayiuse(cpu::x64::avx512_core) ? 16 : 8; - auto &weiDims = getInputShapeAtPort(WEI_ID).getDims(); + auto& weiDims = getInputShapeAtPort(WEI_ID).getDims(); if (weiDims[1] == Shape::UNDEFINED_DIM || weiDims[0] == Shape::UNDEFINED_DIM || // 1. strict fallback, until devising of multigroup handling in common case defConvAttr.group != 1 || // 2. 
common fallback, except specific n_group / n_channel combinations - (defConvAttr.group != 1 && ((weiDims[1] % simd_w != 0) // in_channels_per_gr !% simd_w - || ((weiDims[0] / defConvAttr.group) % simd_w != 0)))) { // out_channels_per_gr !% simd_w + (defConvAttr.group != 1 && + ((weiDims[1] % simd_w != 0) // in_channels_per_gr !% simd_w + || ((weiDims[0] / defConvAttr.group) % simd_w != 0)))) { // out_channels_per_gr !% simd_w enforceRef = true; } else { enforceRef = false; @@ -854,41 +879,48 @@ void DeformableConvolution::initSupportedPrimitiveDescriptors() { auto dataFormat = memory::format_tag::nhwc; auto offFormat = memory::format_tag::nchw; auto weiFormat = mayiuse(avx512_core) ? memory::format_tag::OIhw16i16o : memory::format_tag::OIhw8i8o; - config.inConfs[DATA_ID].setMemDesc(std::make_shared(getInputShapeAtPort(DATA_ID), - memory::data_type::f32, dataFormat)); - config.inConfs[OFF_ID].setMemDesc(std::make_shared(getInputShapeAtPort(OFF_ID), - memory::data_type::f32, offFormat)); + config.inConfs[DATA_ID].setMemDesc( + std::make_shared(getInputShapeAtPort(DATA_ID), memory::data_type::f32, dataFormat)); + config.inConfs[OFF_ID].setMemDesc( + std::make_shared(getInputShapeAtPort(OFF_ID), memory::data_type::f32, offFormat)); - config.inConfs[WEI_ID].setMemDesc(std::make_shared(getInputShapeAtPort(WEI_ID), - memory::data_type::f32, weiFormat)); + config.inConfs[WEI_ID].setMemDesc( + std::make_shared(getInputShapeAtPort(WEI_ID), memory::data_type::f32, weiFormat)); if (inputsNumber > 3) { config.inConfs[MOD_ID].setMemDesc(std::make_shared(getInputShapeAtPort(MOD_ID), - memory::data_type::f32, memory::format_tag::nchw)); + memory::data_type::f32, + memory::format_tag::nchw)); } - config.outConfs[0].setMemDesc(std::make_shared(getOutputShapeAtPort(DATA_ID), - memory::data_type::f32, dataFormat)); + config.outConfs[0].setMemDesc( + std::make_shared(getOutputShapeAtPort(DATA_ID), memory::data_type::f32, dataFormat)); supportedPrimitiveDescriptors.push_back({config, 
impl_type}); } else { // reference implementation - config.inConfs[DATA_ID].setMemDesc(std::make_shared(getInputShapeAtPort(DATA_ID), memory::data_type::f32, + config.inConfs[DATA_ID].setMemDesc(std::make_shared(getInputShapeAtPort(DATA_ID), + memory::data_type::f32, memory::format_tag::nchw)); - config.inConfs[OFF_ID].setMemDesc(std::make_shared(getInputShapeAtPort(OFF_ID), memory::data_type::f32, + config.inConfs[OFF_ID].setMemDesc(std::make_shared(getInputShapeAtPort(OFF_ID), + memory::data_type::f32, memory::format_tag::nchw)); - config.inConfs[WEI_ID].setMemDesc(std::make_shared(getInputShapeAtPort(WEI_ID), memory::data_type::f32, + config.inConfs[WEI_ID].setMemDesc(std::make_shared(getInputShapeAtPort(WEI_ID), + memory::data_type::f32, memory::format_tag::oihw)); if (inputsNumber > 3) { - config.inConfs[MOD_ID].setMemDesc(std::make_shared(getInputShapeAtPort(MOD_ID), memory::data_type::f32, + config.inConfs[MOD_ID].setMemDesc(std::make_shared(getInputShapeAtPort(MOD_ID), + memory::data_type::f32, memory::format_tag::nchw)); } - config.outConfs[0].setMemDesc(std::make_shared(getOutputShapeAtPort(DATA_ID), memory::data_type::f32, + config.outConfs[0].setMemDesc(std::make_shared(getOutputShapeAtPort(DATA_ID), + memory::data_type::f32, memory::format_tag::nchw)); supportedPrimitiveDescriptors.push_back({config, impl_type}); } } -void DeformableConvolution::DefConvExecutor::prepareSamplingWeights( - const float* offsets, const float* modulation, bool enforceRef) { +void DeformableConvolution::DefConvExecutor::prepareSamplingWeights(const float* offsets, + const float* modulation, + bool enforceRef) { const int MB = jcp.mb; const int OH = jcp.oh; const int OW = jcp.ow; @@ -918,45 +950,45 @@ void DeformableConvolution::DefConvExecutor::prepareSamplingWeights( const int h_in = oh * KSH - padT; const int w_in = ow * KSW - padL; - const float *data_offset_ptr = offsets + mb * offStrides[0] + (dg * 2 * KH * KW) * offStrides[1]; - const float *modulation_offset_ptr = 
nullptr; + const float* data_offset_ptr = offsets + mb * offStrides[0] + (dg * 2 * KH * KW) * offStrides[1]; + const float* modulation_offset_ptr = nullptr; if (modulation != nullptr) { modulation_offset_ptr = modulation + mb * modStrides[0] + (dg * ker_size) * modStrides[1]; } for (int kh = 0; kh < KH; kh++) { for (int kw = 0; kw < KW; kw++) { - const size_t data_offset_h_index = 2 * ((size_t) kh * KW + kw) * offStrides[1] + oh * offStrides[2] + ow * offStrides[3]; - const size_t data_offset_w_index = (2 * ((size_t) kh * KW + kw) + 1) * offStrides[1] + oh * offStrides[2] + ow * offStrides[3]; + const size_t data_offset_h_index = + 2 * ((size_t)kh * KW + kw) * offStrides[1] + oh * offStrides[2] + ow * offStrides[3]; + const size_t data_offset_w_index = + (2 * ((size_t)kh * KW + kw) + 1) * offStrides[1] + oh * offStrides[2] + ow * offStrides[3]; const float offset_h = data_offset_ptr[data_offset_h_index]; const float offset_w = data_offset_ptr[data_offset_w_index]; float map_h = h_in + kh * (KDH + 1) + offset_h; float map_w = w_in + kw * (KDW + 1) + offset_w; bool skip_compute; if (with_bi_pad) { - skip_compute = !(static_cast(map_w) > -1 && - static_cast(map_w) < IW && - static_cast(map_h) > -1 && - static_cast(map_h) < IH); + skip_compute = !(static_cast(map_w) > -1 && static_cast(map_w) < IW && + static_cast(map_h) > -1 && static_cast(map_h) < IH); } else { - skip_compute = !(map_w >= 0 && map_w < IW && - map_h >= 0 && map_h < IH); + skip_compute = !(map_w >= 0 && map_w < IW && map_h >= 0 && map_h < IH); } if (!skip_compute) { // modulations precomp. float modulation_scalar = 1.0f; if (modulation_offset_ptr != nullptr) { - size_t modulation_index = (kh * KW + kw) * modStrides[1] + oh * modStrides[2] + ow * modStrides[3]; + size_t modulation_index = + (kh * KW + kw) * modStrides[1] + oh * modStrides[2] + ow * modStrides[3]; modulation_scalar = modulation_offset_ptr[modulation_index]; } // interpolation precomp. 
const int cur_h_end = IH; const int cur_w_end = IW; - int h_low = with_bi_pad ? static_cast(floorf(map_h)) : - std::max(static_cast(floorf(map_h)), 0); - int w_low = with_bi_pad ? static_cast(floorf(map_w)) : - std::max(static_cast(floorf(map_w)), 0); + int h_low = + with_bi_pad ? static_cast(floorf(map_h)) : std::max(static_cast(floorf(map_h)), 0); + int w_low = + with_bi_pad ? static_cast(floorf(map_w)) : std::max(static_cast(floorf(map_w)), 0); int h_high = with_bi_pad ? h_low + 1 : std::min(static_cast(ceilf(map_h)), cur_h_end - 1); int w_high = with_bi_pad ? w_low + 1 : std::min(static_cast(ceilf(map_w)), cur_w_end - 1); @@ -976,7 +1008,7 @@ void DeformableConvolution::DefConvExecutor::prepareSamplingWeights( const int h_off_low = h_ind_low * (srcStrides[2] / srcStrides[3]); const int h_off_high = h_ind_high * (srcStrides[2] / srcStrides[3]); - const int w_off_low = w_ind_low; + const int w_off_low = w_ind_low; const int w_off_high = w_ind_high; pSampledCoordsVector[sampledCoordIndex] = h_off_high + w_off_high; pSampledCoordsVector[sampledCoordIndex + 1] = h_off_high + w_off_low; @@ -984,7 +1016,7 @@ void DeformableConvolution::DefConvExecutor::prepareSamplingWeights( pSampledCoordsVector[sampledCoordIndex + 3] = h_off_low + w_off_low; float w22 = hh * hw * modulation_scalar, w21 = hh * lw * modulation_scalar, - w12 = lh * hw * modulation_scalar, w11 = lh * lw * modulation_scalar; + w12 = lh * hw * modulation_scalar, w11 = lh * lw * modulation_scalar; pInterpWeightsVector[sampledCoordIndex] = w11; pInterpWeightsVector[sampledCoordIndex + 1] = w12; @@ -1007,15 +1039,16 @@ void DeformableConvolution::DefConvExecutor::prepareSamplingWeights( }); } -DeformableConvolution::DefConvExecutor::DefConvExecutor(const DefConvAttr &defConvAttr, - const std::vector> &descVector) { +DeformableConvolution::DefConvExecutor::DefConvExecutor( + const DefConvAttr& defConvAttr, + const std::vector>& descVector) { if (descVector.size() != 4 && descVector.size() != 5) { 
OPENVINO_THROW("Deformable Convolution executor got incorrect desc's count (", descVector.size(), ")"); } bool withModulation = descVector.size() == 5; - auto &srcDesc = descVector[DATA_ID]; - auto &dstDesc = descVector[descVector.size() - 1]; + auto& srcDesc = descVector[DATA_ID]; + auto& dstDesc = descVector[descVector.size() - 1]; srcStrides = std::vector(srcDesc->getStrides().size()); offStrides = descVector[OFF_ID]->getStrides(); weiStrides = descVector[WEI_ID]->getStrides(); @@ -1085,9 +1118,10 @@ DeformableConvolution::DefConvExecutor::DefConvExecutor(const DefConvAttr &defCo jcp.nthr = dnnl_get_max_threads(); } -DeformableConvolution::DefConvJitExecutor::DefConvJitExecutor(const DefConvAttr &defConvAttr, - const std::vector> &descVector) : - DefConvExecutor(defConvAttr, descVector) { +DeformableConvolution::DefConvJitExecutor::DefConvJitExecutor( + const DefConvAttr& defConvAttr, + const std::vector>& descVector) + : DefConvExecutor(defConvAttr, descVector) { #if defined(OPENVINO_ARCH_X86_64) if (mayiuse(cpu::x64::avx512_core)) { def_conv_kernel.reset(new jit_uni_def_conv_kernel_f32(jcp)); @@ -1106,9 +1140,13 @@ DeformableConvolution::DefConvJitExecutor::DefConvJitExecutor(const DefConvAttr #endif } -void DeformableConvolution::DefConvRefExecutor::exec(const float* src, const float* offsets, - const float* weights, const float* modulation, float* dst, - int *pSampledCoordsVector, float *pInterpWeightsVector) { +void DeformableConvolution::DefConvRefExecutor::exec(const float* src, + const float* offsets, + const float* weights, + const float* modulation, + float* dst, + int* pSampledCoordsVector, + float* pInterpWeightsVector) { this->pSampledCoordsVector = pSampledCoordsVector; this->pInterpWeightsVector = pInterpWeightsVector; prepareSamplingWeights(offsets, modulation, true); @@ -1133,17 +1171,18 @@ void DeformableConvolution::DefConvRefExecutor::exec(const float* src, const flo auto compKer = [OV_CAPTURE_CPY_AND_THIS](int g, int mb, int oc, int oh, int 
ow) { float d = 0; for (int ic = 0; ic < IC; ic++) { - const float *data_im_ptr = src + mb * srcStrides[0] + (g * IC + ic) * srcStrides[1]; + const float* data_im_ptr = src + mb * srcStrides[0] + (g * IC + ic) * srcStrides[1]; const int deformable_group_index = (IC * g + ic) / channel_per_deformable_group; - int sampledCoordIndex = (mb * DGHW + deformable_group_index * HW + oh * OW + ow) * ker_size * sampledPointsPerPixel; - size_t weiIndex = (size_t) g * group_wei_stride + oc * weiStrides[0] + ic * weiStrides[1]; + int sampledCoordIndex = + (mb * DGHW + deformable_group_index * HW + oh * OW + ow) * ker_size * sampledPointsPerPixel; + size_t weiIndex = (size_t)g * group_wei_stride + oc * weiStrides[0] + ic * weiStrides[1]; for (size_t kh_off = 0; kh_off < KH * weiStrides[2]; kh_off += weiStrides[2]) { for (size_t kw_off = 0; kw_off < KW * weiStrides[3]; kw_off += weiStrides[3]) { // check if current addendum marked as equal zero if (pSampledCoordsVector[sampledCoordIndex] != -1) { const int v11 = pSampledCoordsVector[sampledCoordIndex]; const int v12 = pSampledCoordsVector[sampledCoordIndex + 1]; - const int v21 = pSampledCoordsVector[sampledCoordIndex + 2]; + const int v21 = pSampledCoordsVector[sampledCoordIndex + 2]; const int v22 = pSampledCoordsVector[sampledCoordIndex + 3]; float val = 0; @@ -1174,8 +1213,9 @@ void DeformableConvolution::DefConvRefExecutor::exec(const float* src, const flo }; parallel_nd(G, MB, OC, OH, OW, [&](dnnl_dim_t g, dnnl_dim_t mb, dnnl_dim_t oc, dnnl_dim_t oh, dnnl_dim_t ow) { - dst[mb * dstStrides[0] + (g * OC + oc) * dstStrides[1] + oh * dstStrides[2] + ow * dstStrides[3]] = compKer(g, mb, oc, oh, ow); - }); + dst[mb * dstStrides[0] + (g * OC + oc) * dstStrides[1] + oh * dstStrides[2] + ow * dstStrides[3]] = + compKer(g, mb, oc, oh, ow); + }); } void DeformableConvolution::prepareParams() { @@ -1208,22 +1248,17 @@ void DeformableConvolution::prepareParams() { updatePadding(); - std::vector> descVector { + std::vector> descVector{ 
getParentEdgeAt(DATA_ID)->getMemory().getDescWithType(), getParentEdgeAt(OFF_ID)->getMemory().getDescWithType(), - getParentEdgeAt(WEI_ID)->getMemory().getDescWithType() - }; + getParentEdgeAt(WEI_ID)->getMemory().getDescWithType()}; if (withModulation) { descVector.push_back(getParentEdgeAt(MOD_ID)->getMemory().getDescWithType()); } descVector.push_back(getChildEdgeAt(0)->getMemory().getDescWithType()); - DefConvKey key = { - descVector, - defConvAttr, - getSelectedPrimitiveDescriptor()->getImplementationType() - }; + DefConvKey key = {descVector, defConvAttr, getSelectedPrimitiveDescriptor()->getImplementationType()}; const int MB = getParentEdgeAt(DATA_ID)->getMemory().getStaticDims()[0]; const int OH = getChildEdgeAt(0)->getMemory().getStaticDims()[2]; @@ -1241,7 +1276,7 @@ void DeformableConvolution::prepareParams() { execPtr = nullptr; auto cache = context->getParamsCache(); - auto result = cache->getOrCreate(key, [] (const DefConvKey& key) -> std::shared_ptr { + auto result = cache->getOrCreate(key, [](const DefConvKey& key) -> std::shared_ptr { if (key.implType == impl_desc_type::ref) { return std::make_shared(key.defConvAttr, key.descVector); } @@ -1258,9 +1293,13 @@ void DeformableConvolution::executeDynamicImpl(dnnl::stream strm) { execute(strm); } -void DeformableConvolution::DefConvJitExecutor::exec(const float* src, const float* offsets, - const float* weights, const float* modulation, float* dst, - int *pSampledCoordsVector, float *pInterpWeightsVector) { +void DeformableConvolution::DefConvJitExecutor::exec(const float* src, + const float* offsets, + const float* weights, + const float* modulation, + float* dst, + int* pSampledCoordsVector, + float* pInterpWeightsVector) { this->pSampledCoordsVector = pSampledCoordsVector; this->pInterpWeightsVector = pInterpWeightsVector; prepareSamplingWeights(offsets, modulation, false); @@ -1276,9 +1315,11 @@ void DeformableConvolution::DefConvJitExecutor::exec(const float* src, const flo const size_t _oc = g * 
jcp.nb_oc; const size_t _ic = g * jcp.nb_ic; - par_conv.src = &src[n * srcStrides[0] + _ic*jcp.ic_block * srcStrides[1]]; - par_conv.sampledWei = &(pInterpWeightsVector[(n * jcp.dg * jcp.oh + oh) * jcp.kh * jcp.kw * jcp.ow * sampledPointsPerPixel]); - par_conv.sampledCoords = &(pSampledCoordsVector[(n * jcp.dg * jcp.oh + oh) * jcp.kh * jcp.kw * jcp.ow * sampledPointsPerPixel]); + par_conv.src = &src[n * srcStrides[0] + _ic * jcp.ic_block * srcStrides[1]]; + par_conv.sampledWei = + &(pInterpWeightsVector[(n * jcp.dg * jcp.oh + oh) * jcp.kh * jcp.kw * jcp.ow * sampledPointsPerPixel]); + par_conv.sampledCoords = + &(pSampledCoordsVector[(n * jcp.dg * jcp.oh + oh) * jcp.kh * jcp.kw * jcp.ow * sampledPointsPerPixel]); par_conv.filt = &weights[g * jcp.nb_oc * jcp.nb_ic * jcp.kh * jcp.kw * jcp.ic_block * jcp.oc_block]; par_conv.dst = &dst[n * dstStrides[0] + _oc * jcp.oc_block * dstStrides[1] + oh * dstStrides[2]]; par_conv.buf = input_buffer_ptr + ithr * jcp.ur_w * jcp.kh * jcp.kw * jcp.ic; @@ -1292,20 +1333,20 @@ void DeformableConvolution::DefConvJitExecutor::exec(const float* src, const flo void DeformableConvolution::execute(dnnl::stream strm) { const size_t inputsNumber = getOriginalInputsNumber(); - auto &srcMemory0 = getParentEdgeAt(0)->getMemory(); - auto &srcMemory1 = getParentEdgeAt(1)->getMemory(); - auto &srcMemory2 = getParentEdgeAt(2)->getMemory(); - auto &dstMemory = getChildEdgeAt(0)->getMemory(); + auto& srcMemory0 = getParentEdgeAt(0)->getMemory(); + auto& srcMemory1 = getParentEdgeAt(1)->getMemory(); + auto& srcMemory2 = getParentEdgeAt(2)->getMemory(); + auto& dstMemory = getChildEdgeAt(0)->getMemory(); - const auto *src = srcMemory0.getDataAs(); - const auto *offsets = srcMemory1.getDataAs(); - const auto *weights = srcMemory2.getDataAs(); + const auto* src = srcMemory0.getDataAs(); + const auto* offsets = srcMemory1.getDataAs(); + const auto* weights = srcMemory2.getDataAs(); float* modulation = nullptr; if (inputsNumber > 3) { modulation = 
getSrcDataAtPortAs(3); } - float *dst = dstMemory.getDataAs(); + float* dst = dstMemory.getDataAs(); auto selectedPrimitiveDescriptor = getSelectedPrimitiveDescriptor(); if (!selectedPrimitiveDescriptor) @@ -1333,6 +1374,6 @@ ov::element::Type DeformableConvolution::getRuntimePrecision() const { return getMaxPrecision(getInputPrecisions()); } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/def_conv.h b/src/plugins/intel_cpu/src/nodes/def_conv.h index 127fd00eb2bf00..ed5800a19a0e84 100644 --- a/src/plugins/intel_cpu/src/nodes/def_conv.h +++ b/src/plugins/intel_cpu/src/nodes/def_conv.h @@ -5,6 +5,7 @@ #pragma once #include + #include #include #include @@ -43,20 +44,20 @@ struct jit_def_conv_params { }; struct jit_def_conv_call_args { - const void *src; - const void *sampledWei; - const void *sampledCoords; - const void *filt; - const void *bias; - const void *dst; - const void *buf; + const void* src; + const void* sampledWei; + const void* sampledCoords; + const void* filt; + const void* bias; + const void* dst; + const void* buf; size_t oh_pos; }; struct jit_uni_def_conv_kernel { - void (*ker_)(const jit_def_conv_call_args *); + void (*ker_)(const jit_def_conv_call_args*); - void operator()(const jit_def_conv_call_args *args) { + void operator()(const jit_def_conv_call_args* args) { assert(ker_); ker_(args); } @@ -109,53 +110,66 @@ class DeformableConvolution : public Node { static constexpr size_t MOD_ID = 3; std::string errorPrefix; class DefConvExecutor { - public: - DefConvExecutor(const DefConvAttr &defConvAttr, - const std::vector> &descVector); - - virtual void exec(const float* src, const float* offsets, - const float* weights, const float* modulation, float* dst, - int *pSampledCoordsVector, float *pInterpWeightsVector) = 0; - virtual ~DefConvExecutor() = default; - - protected: - void prepareSamplingWeights(const float* offsets, 
const float* modulation = nullptr, bool enforceRef = false); - jit_def_conv_params jcp = {}; - VectorDims srcStrides; - VectorDims offStrides; - VectorDims weiStrides; - VectorDims modStrides; - VectorDims dstStrides; - int *pSampledCoordsVector; - float *pInterpWeightsVector; + public: + DefConvExecutor(const DefConvAttr& defConvAttr, + const std::vector>& descVector); + + virtual void exec(const float* src, + const float* offsets, + const float* weights, + const float* modulation, + float* dst, + int* pSampledCoordsVector, + float* pInterpWeightsVector) = 0; + virtual ~DefConvExecutor() = default; + + protected: + void prepareSamplingWeights(const float* offsets, const float* modulation = nullptr, bool enforceRef = false); + jit_def_conv_params jcp = {}; + VectorDims srcStrides; + VectorDims offStrides; + VectorDims weiStrides; + VectorDims modStrides; + VectorDims dstStrides; + int* pSampledCoordsVector; + float* pInterpWeightsVector; }; class DefConvRefExecutor : public DefConvExecutor { - public: - DefConvRefExecutor(const DefConvAttr &defConvAttr, - const std::vector> &descVector) : - DefConvExecutor(defConvAttr, descVector) {} - - void exec(const float* src, const float* offsets, - const float* weights, const float* modulation, float* dst, - int *pSampledCoordsVector, float *pInterpWeightsVector) override; + public: + DefConvRefExecutor(const DefConvAttr& defConvAttr, + const std::vector>& descVector) + : DefConvExecutor(defConvAttr, descVector) {} + + void exec(const float* src, + const float* offsets, + const float* weights, + const float* modulation, + float* dst, + int* pSampledCoordsVector, + float* pInterpWeightsVector) override; }; class DefConvJitExecutor : public DefConvExecutor { - std::shared_ptr def_conv_kernel = nullptr; - public: - DefConvJitExecutor(const DefConvAttr &defConvAttr, - const std::vector> &descVector); - - void exec(const float* src, const float* offsets, - const float* weights, const float* modulation, float* dst, - int 
*pSampledCoordsVector, float *pInterpWeightsVector) override; + std::shared_ptr def_conv_kernel = nullptr; + + public: + DefConvJitExecutor(const DefConvAttr& defConvAttr, + const std::vector>& descVector); + + void exec(const float* src, + const float* offsets, + const float* weights, + const float* modulation, + float* dst, + int* pSampledCoordsVector, + float* pInterpWeightsVector) override; }; std::shared_ptr execPtr = nullptr; bool autoPadding = false; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/depth_to_space.cpp b/src/plugins/intel_cpu/src/nodes/depth_to_space.cpp index eb3789068adca1..a8629ce2592d76 100644 --- a/src/plugins/intel_cpu/src/nodes/depth_to_space.cpp +++ b/src/plugins/intel_cpu/src/nodes/depth_to_space.cpp @@ -4,16 +4,15 @@ #include "depth_to_space.h" -#include "dnnl_extension_utils.h" -#include "utils/general_utils.h" - #include -#include "common/primitive_hashing_utils.hpp" -#include "cpu/x64/jit_generator.hpp" -#include "openvino/opsets/opset1.hpp" #include #include "common/blocked_desc_creator.h" +#include "common/primitive_hashing_utils.hpp" +#include "cpu/x64/jit_generator.hpp" +#include "dnnl_extension_utils.h" +#include "openvino/opsets/opset1.hpp" +#include "utils/general_utils.h" #define THROW_ERROR(...) 
OPENVINO_THROW("DepthToSpace layer with name '", getName(), "' ", __VA_ARGS__) @@ -40,9 +39,8 @@ size_t DepthToSpace::DepthToSpaceAttrs::hash() const { } bool DepthToSpace::DepthToSpaceAttrs::operator==(const DepthToSpaceAttrs& rhs) const { - bool result = layoutType == rhs.layoutType && mode == rhs.mode && - blockSize == rhs.blockSize && blockStep == rhs.blockStep && - dataSize == rhs.dataSize && nSpatialDims == rhs.nSpatialDims && + bool result = layoutType == rhs.layoutType && mode == rhs.mode && blockSize == rhs.blockSize && + blockStep == rhs.blockStep && dataSize == rhs.dataSize && nSpatialDims == rhs.nSpatialDims && srcBlockedDims == rhs.srcBlockedDims; return result; @@ -56,7 +54,9 @@ bool DepthToSpace::isSupportedOperation(const std::shared_ptr& o return false; } const auto mode = depthToSpace->get_mode(); - if (!one_of(mode, ov::op::v0::DepthToSpace::DepthToSpaceMode::BLOCKS_FIRST, ov::op::v0::DepthToSpace::DepthToSpaceMode::DEPTH_FIRST)) { + if (!one_of(mode, + ov::op::v0::DepthToSpace::DepthToSpaceMode::BLOCKS_FIRST, + ov::op::v0::DepthToSpace::DepthToSpaceMode::DEPTH_FIRST)) { errorMessage = "Does not support mode: " + ov::as_string(mode); return false; } @@ -138,7 +138,8 @@ void DepthToSpace::initSupportedPrimitiveDescriptors() { if (inputDataShape.getRank() > 2) { const auto& srcDims = inputDataShape.getDims(); auto canUseBlocked = [OV_CAPTURE_CPY_AND_THIS](const size_t block) { - return srcDims[1] != Shape::UNDEFINED_DIM && srcDims[1] % block == 0 && (srcDims[1] / block) % attrs.blockStep == 0 && + return srcDims[1] != Shape::UNDEFINED_DIM && srcDims[1] % block == 0 && + (srcDims[1] / block) % attrs.blockStep == 0 && (attrs.mode == Mode::DEPTH_FIRST ? 
block % attrs.blockStep == 0 : true); }; @@ -172,9 +173,10 @@ void DepthToSpace::createPrimitive() { const auto& memoryDesc = srcMemPtr->getDesc(); attrs.dataSize = memoryDesc.getPrecision().size(); attrs.nSpatialDims = memoryDesc.getShape().getRank() - 2; - attrs.layoutType = memoryDesc.hasLayoutType(LayoutType::nCsp16c) ? LayoutType::nCsp16c : - memoryDesc.hasLayoutType(LayoutType::nCsp8c) ? LayoutType::nCsp8c : - memoryDesc.hasLayoutType(LayoutType::nspc) ? LayoutType::nspc : LayoutType::ncsp; + attrs.layoutType = memoryDesc.hasLayoutType(LayoutType::nCsp16c) ? LayoutType::nCsp16c + : memoryDesc.hasLayoutType(LayoutType::nCsp8c) ? LayoutType::nCsp8c + : memoryDesc.hasLayoutType(LayoutType::nspc) ? LayoutType::nspc + : LayoutType::ncsp; if (inputShapesDefined()) { if (needPrepareParams()) @@ -205,7 +207,8 @@ DepthToSpace::DepthToSpaceExecutor::DepthToSpaceExecutor(const DepthToSpaceAttrs const bool isBlocked = one_of(attrs.layoutType, LayoutType::nCsp16c, LayoutType::nCsp8c); const bool isChannelsFirst = attrs.layoutType == LayoutType::nspc; const size_t nDims = attrs.srcBlockedDims.size(); - const size_t reshapedRank = nDims + attrs.nSpatialDims + static_cast(isBlocked && attrs.mode == Mode::DEPTH_FIRST); + const size_t reshapedRank = + nDims + attrs.nSpatialDims + static_cast(isBlocked && attrs.mode == Mode::DEPTH_FIRST); const size_t lastIdx = reshapedRank - 1; size_t firstSpatialOrder = 2; @@ -219,21 +222,24 @@ DepthToSpace::DepthToSpaceExecutor::DepthToSpaceExecutor(const DepthToSpaceAttrs params.src_block_dims[0] = attrs.srcBlockedDims[0]; // reshaping of src dimensions and creating the permutation order for each layout: - // new shape: mode = blocks_first [N, block_size, block_size, ..., block_size, C / (block_size ^ K), D1, D2, ..., DK] - // mode = depth_first [N, C / (block_size ^ K), block_size, block_size, ..., block_size, D1, D2, ..., DK] + // new shape: mode = blocks_first [N, block_size, block_size, ..., block_size, C / (block_size ^ K), D1, D2, 
..., + // DK] + // mode = depth_first [N, C / (block_size ^ K), block_size, block_size, ..., block_size, D1, D2, ..., + // DK] // order : mode = blocks_first : [0, K + 1, K + 2, 1, K + 3, 2, K + 4, 3, ..., K + (K + 1), K] // mode = depth_first : [0, 1, K + 2, 2, K + 3, 3, K + 4, 4, ..., K + (K + 1), K + 1] // where `k` is number of spatial dimensions - auto reshapeAndSetPermOrder = [&](const size_t idx1, const size_t idx2, const size_t shift, const VectorDims& dims) { - for (size_t i = 0; i < attrs.nSpatialDims; i++) { - params.order[i * 2 + shift] = i + idx1; - params.order[i * 2 + shift + 1] = i + idx2; + auto reshapeAndSetPermOrder = + [&](const size_t idx1, const size_t idx2, const size_t shift, const VectorDims& dims) { + for (size_t i = 0; i < attrs.nSpatialDims; i++) { + params.order[i * 2 + shift] = i + idx1; + params.order[i * 2 + shift + 1] = i + idx2; - params.src_block_dims[params.order[i * 2 + shift]] = dims[i + shift]; - params.src_block_dims[params.order[i * 2 + shift + 1]] = attrs.blockSize; - } - }; + params.src_block_dims[params.order[i * 2 + shift]] = dims[i + shift]; + params.src_block_dims[params.order[i * 2 + shift + 1]] = attrs.blockSize; + } + }; if (isBlocked) { size_t orderShiftForBlocks, orderShiftForDims; @@ -314,6 +320,6 @@ bool DepthToSpace::created() const { return getType() == Type::DepthToSpace; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/depth_to_space.h b/src/plugins/intel_cpu/src/nodes/depth_to_space.h index 2eda39f60394af..21eca73f97318c 100644 --- a/src/plugins/intel_cpu/src/nodes/depth_to_space.h +++ b/src/plugins/intel_cpu/src/nodes/depth_to_space.h @@ -54,6 +54,6 @@ class DepthToSpace : public Node { executorPtr execPtr = nullptr; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git 
a/src/plugins/intel_cpu/src/nodes/detection_output.cpp b/src/plugins/intel_cpu/src/nodes/detection_output.cpp index 99702780b83034..9cf52e7042c6ba 100644 --- a/src/plugins/intel_cpu/src/nodes/detection_output.cpp +++ b/src/plugins/intel_cpu/src/nodes/detection_output.cpp @@ -16,8 +16,7 @@ namespace node { namespace { template -bool SortScorePairDescend(const std::pair& pair1, - const std::pair& pair2) { +bool SortScorePairDescend(const std::pair& pair1, const std::pair& pair2) { return (pair1.first > pair2.first) || (pair1.first == pair2.first && pair1.second < pair2.second); } @@ -27,9 +26,10 @@ bool SortScorePairDescend>(const std::pair pair2.first) || (pair1.first == pair2.first && pair1.second.second < pair2.second.second); } -} // namespace +} // namespace -bool DetectionOutput::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool DetectionOutput::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage) noexcept { try { const auto doOp = ov::as_type_ptr(op); if (!doOp) { @@ -58,7 +58,7 @@ DetectionOutput::DetectionOutput(const std::shared_ptr& op, const Grap errorPrefix = "DetectionOutput node with name '" + getName() + "' "; if (getOriginalInputsNumber() != 3 && getOriginalInputsNumber() != 5) - OPENVINO_THROW(errorPrefix, "has incorrect number of input edges."); + OPENVINO_THROW(errorPrefix, "has incorrect number of input edges."); if (getOriginalOutputsNumber() != 1) OPENVINO_THROW(errorPrefix, "has incorrect number of output edges."); @@ -93,7 +93,7 @@ DetectionOutput::DetectionOutput(const std::shared_ptr& op, const Grap void DetectionOutput::prepareParams() { const auto& idPriorDims = getParentEdgeAt(ID_PRIOR)->getMemory().getShape().getStaticDims(); - const auto &idConfDims = getParentEdgeAt(ID_CONF)->getMemory().getShape().getStaticDims(); + const auto& idConfDims = getParentEdgeAt(ID_CONF)->getMemory().getShape().getStaticDims(); priorsNum = static_cast(idPriorDims.back() / priorSize); 
isPriorsPerImg = idPriorDims.front() != 1; classesNum = static_cast(idConfDims.back() / priorsNum); @@ -130,9 +130,8 @@ void DetectionOutput::prepareParams() { // --> g_topk(vector<>(all detections) --> indices per class)) // MXNet: max conf for prior within img, filter(indices) --> topk_img(buffer) --> nms_cls(indices) // --> g_topk(vector<>(all detections) --> indices per class)) - isSparsityWorthwhile = - (confidenceThreshold > sparsityThreshold) && - ((classesNum * priorsNum * sizeof(float) * 2) > static_cast(cacheSizeL3)); + isSparsityWorthwhile = (confidenceThreshold > sparsityThreshold) && + ((classesNum * priorsNum * sizeof(float) * 2) > static_cast(cacheSizeL3)); confInfoLen = (!decreaseClassId && isSparsityWorthwhile) ? (2 * priorsNum + 1) : priorsNum; reorderedConf.resize(imgNum * classesNum * confInfoLen); @@ -149,17 +148,17 @@ void DetectionOutput::initSupportedPrimitiveDescriptors() { for (size_t i = 0; i < inputShapes.size(); ++i) inDataConf.emplace_back(LayoutType::ncsp, ov::element::f32); - addSupportedPrimDesc(inDataConf, - {{LayoutType::ncsp, ov::element::f32}}, - impl_desc_type::ref_any); + addSupportedPrimDesc(inDataConf, {{LayoutType::ncsp, ov::element::f32}}, impl_desc_type::ref_any); } struct ConfidenceComparatorDO { explicit ConfidenceComparatorDO(const float* confDataIn) : confData(confDataIn) {} bool operator()(int idx1, int idx2) { - if (confData[idx1] > confData[idx2]) return true; - if (confData[idx1] < confData[idx2]) return false; + if (confData[idx1] > confData[idx2]) + return true; + if (confData[idx1] < confData[idx2]) + return false; return idx1 < idx2; } @@ -171,31 +170,29 @@ void DetectionOutput::executeDynamicImpl(dnnl::stream strm) { } void DetectionOutput::execute(dnnl::stream strm) { - float *dstData = getDstDataAtPortAs(0); + float* dstData = getDstDataAtPortAs(0); - const float *locData = getSrcDataAtPortAs(ID_LOC); - const float *confData = getSrcDataAtPortAs(ID_CONF); - const float *priorData = 
getSrcDataAtPortAs(ID_PRIOR); - const float *ARMConfData = inputShapes.size() > 3 ? - getSrcDataAtPortAs(ID_ARM_CONF) : nullptr; - const float *ARMLocData = inputShapes.size() > 4 ? - getSrcDataAtPortAs(ID_ARM_LOC) : nullptr; + const float* locData = getSrcDataAtPortAs(ID_LOC); + const float* confData = getSrcDataAtPortAs(ID_CONF); + const float* priorData = getSrcDataAtPortAs(ID_PRIOR); + const float* ARMConfData = inputShapes.size() > 3 ? getSrcDataAtPortAs(ID_ARM_CONF) : nullptr; + const float* ARMLocData = inputShapes.size() > 4 ? getSrcDataAtPortAs(ID_ARM_LOC) : nullptr; - float *reorderedConfData = reorderedConf.data(); - int *reorderedConfDataIndices = reinterpret_cast(reorderedConf.data()); + float* reorderedConfData = reorderedConf.data(); + int* reorderedConfDataIndices = reinterpret_cast(reorderedConf.data()); - float *decodedBboxesData = decodedBboxes.data(); - float *bboxSizesData = bboxSizes.data(); - int *indicesData = indices.data(); - int *indicesBufData = indicesBuffer.data(); - int *detectionsData = detectionsCount.data(); + float* decodedBboxesData = decodedBboxes.data(); + float* bboxSizesData = bboxSizes.data(); + int* indicesData = indices.data(); + int* indicesBufData = indicesBuffer.data(); + int* detectionsData = detectionsCount.data(); memset(detectionsData, 0, imgNum * classesNum * sizeof(int)); int priorsBatch = isPriorsPerImg ? imgNum : 1; - int *numPriorsActualdata = numPriorsActual.data(); + int* numPriorsActualdata = numPriorsActual.data(); for (int n = 0; n < priorsBatch; ++n) { - const float *ppriors = priorData; + const float* ppriors = priorData; ppriors += varianceEncodedInTarget ? 
(n * priorsNum * priorSize) : (2 * n * priorsNum * priorSize); getActualPriorNum(ppriors, numPriorsActualdata, n); } @@ -204,21 +201,32 @@ void DetectionOutput::execute(dnnl::stream strm) { if (!isSparsityWorthwhile) { confReorderDense(confData, ARMConfData, reorderedConfData); - } else { // sparsity + } else { // sparsity if (!decreaseClassId) { - confReorderAndFilterSparsityCF(confData, ARMConfData, reorderedConfData, indicesData, indicesBufData, detectionsData); + confReorderAndFilterSparsityCF(confData, + ARMConfData, + reorderedConfData, + indicesData, + indicesBufData, + detectionsData); } else { - confReorderAndFilterSparsityMX(confData, ARMConfData, reorderedConfData, indicesData, indicesBufData, detectionsData); + confReorderAndFilterSparsityMX(confData, + ARMConfData, + reorderedConfData, + indicesData, + indicesBufData, + detectionsData); } } - int *confInfoV = confInfoForPrior.data(); + int* confInfoV = confInfoForPrior.data(); for (int n = 0; n < imgNum; ++n) { - const float *ppriors = priorData; - const float *priorVariances = priorData + priorsNum * priorSize; + const float* ppriors = priorData; + const float* priorVariances = priorData + priorsNum * priorSize; if (isPriorsPerImg) { - int priorSizePerImg = varianceEncodedInTarget ? (n * priorsNum * priorSize) : (2 * n * priorsNum * priorSize); + int priorSizePerImg = + varianceEncodedInTarget ? (n * priorsNum * priorSize) : (2 * n * priorsNum * priorSize); ppriors += priorSizePerImg; priorVariances += varianceEncodedInTarget ? 
0 : priorSizePerImg; } @@ -226,17 +234,50 @@ void DetectionOutput::execute(dnnl::stream strm) { if (isShareLoc) { int locShift = n * priorsNum; int coordShift = locShift * 4; - const float *ploc = locData + coordShift; - float *pboxes = decodedBboxesData + coordShift; - float *psizes = bboxSizesData + locShift; - int *confInfoVB = confInfoV + locShift; + const float* ploc = locData + coordShift; + float* pboxes = decodedBboxesData + coordShift; + float* psizes = bboxSizesData + locShift; + int* confInfoVB = confInfoV + locShift; if (withAddBoxPred) { - const float *pARMLoc = ARMLocData + coordShift; - decodeBBoxes(ppriors, pARMLoc, priorVariances, pboxes, psizes, numPriorsActualdata, n, coordOffset, priorSize, true, nullptr, confInfoVB); - decodeBBoxes(pboxes, ploc, priorVariances, pboxes, psizes, numPriorsActualdata, n, 0, 4, false, nullptr, confInfoVB); + const float* pARMLoc = ARMLocData + coordShift; + decodeBBoxes(ppriors, + pARMLoc, + priorVariances, + pboxes, + psizes, + numPriorsActualdata, + n, + coordOffset, + priorSize, + true, + nullptr, + confInfoVB); + decodeBBoxes(pboxes, + ploc, + priorVariances, + pboxes, + psizes, + numPriorsActualdata, + n, + 0, + 4, + false, + nullptr, + confInfoVB); } else { - decodeBBoxes(ppriors, ploc, priorVariances, pboxes, psizes, numPriorsActualdata, n, coordOffset, priorSize, true, nullptr, confInfoVB); + decodeBBoxes(ppriors, + ploc, + priorVariances, + pboxes, + psizes, + numPriorsActualdata, + n, + coordOffset, + priorSize, + true, + nullptr, + confInfoVB); } } else { for (int c = 0; c < locNumForClasses; ++c) { @@ -245,16 +286,46 @@ void DetectionOutput::execute(dnnl::stream strm) { } int locShift = n * priorsNum * locNumForClasses; int coordShift = locShift * 4; - const float *ploc = locData + coordShift + c * 4; - float *pboxes = decodedBboxesData + coordShift + c * 4 * priorsNum; - float *psizes = bboxSizesData + locShift + c * priorsNum; - int *confInfoHBC = reorderedConfDataIndices + n * confInfoLen * classesNum 
+ c*confInfoLen; + const float* ploc = locData + coordShift + c * 4; + float* pboxes = decodedBboxesData + coordShift + c * 4 * priorsNum; + float* psizes = bboxSizesData + locShift + c * priorsNum; + int* confInfoHBC = reorderedConfDataIndices + n * confInfoLen * classesNum + c * confInfoLen; if (withAddBoxPred) { - const float *pARMLoc = ARMLocData + n * 4 * locNumForClasses * priorsNum + c * 4; - decodeBBoxes(ppriors, pARMLoc, priorVariances, pboxes, psizes, numPriorsActualdata, n, coordOffset, priorSize, true, confInfoHBC); - decodeBBoxes(pboxes, ploc, priorVariances, pboxes, psizes, numPriorsActualdata, n, 0, 4, false, confInfoHBC); + const float* pARMLoc = ARMLocData + n * 4 * locNumForClasses * priorsNum + c * 4; + decodeBBoxes(ppriors, + pARMLoc, + priorVariances, + pboxes, + psizes, + numPriorsActualdata, + n, + coordOffset, + priorSize, + true, + confInfoHBC); + decodeBBoxes(pboxes, + ploc, + priorVariances, + pboxes, + psizes, + numPriorsActualdata, + n, + 0, + 4, + false, + confInfoHBC); } else { - decodeBBoxes(ppriors, ploc, priorVariances, pboxes, psizes, numPriorsActualdata, n, coordOffset, priorSize, true, confInfoHBC); + decodeBBoxes(ppriors, + ploc, + priorVariances, + pboxes, + psizes, + numPriorsActualdata, + n, + coordOffset, + priorSize, + true, + confInfoHBC); } } } @@ -267,16 +338,16 @@ void DetectionOutput::execute(dnnl::stream strm) { parallel_for(classesNum, [&](int c) { if (c != backgroundClassId) { // Ignore background class const int off = n * priorsNum * classesNum + c * priorsNum; - const float *pconfReorder = reorderedConfData + off; - int *pindices = indicesData + off; - int *pbuffer = indicesBufData + off; - int *pdetections = detectionsData + n * classesNum + c; + const float* pconfReorder = reorderedConfData + off; + int* pindices = indicesData + off; + int* pbuffer = indicesBufData + off; + int* pdetections = detectionsData + n * classesNum + c; if (!isSparsityWorthwhile) confFilterCF(pconfReorder, pindices, pbuffer, 
pdetections, n); - const float *pboxes; - const float *psizes; + const float* pboxes; + const float* psizes; if (isShareLoc) { pboxes = decodedBboxesData + n * 4 * priorsNum; psizes = bboxSizesData + n * priorsNum; @@ -291,23 +362,23 @@ void DetectionOutput::execute(dnnl::stream strm) { } else { // MXNet style const int offImg = n * priorsNum * classesNum; - const float *pconf = confData + offImg; - float *pconfReorder = reorderedConfData + offImg; - int *pbuffer = indicesBufData + offImg; - int *pindices = indicesData + offImg; - int *pdetections = detectionsData + n * classesNum; + const float* pconf = confData + offImg; + float* pconfReorder = reorderedConfData + offImg; + int* pbuffer = indicesBufData + offImg; + int* pindices = indicesData + offImg; + int* pdetections = detectionsData + n * classesNum; if (!isSparsityWorthwhile) confFilterMX(pconf, ARMConfData, pconfReorder, pindices, pbuffer, pdetections, n); - const float *pboxes = decodedBboxesData + n * 4 * locNumForClasses * priorsNum; - const float *psizes = bboxSizesData + n * locNumForClasses * priorsNum; + const float* pboxes = decodedBboxesData + n * 4 * locNumForClasses * priorsNum; + const float* psizes = bboxSizesData + n * locNumForClasses * priorsNum; NMSMX(pbuffer, pdetections, pindices, pboxes, psizes); } int detectionsTotal = 0; - detectionsTotal = parallel_sum(classesNum, detectionsTotal, [&](size_t c)->int { + detectionsTotal = parallel_sum(classesNum, detectionsTotal, [&](size_t c) -> int { return detectionsData[n * classesNum + c]; }); @@ -318,9 +389,9 @@ void DetectionOutput::execute(dnnl::stream strm) { std::mutex mtx; parallel_for(classesNum, [&](int c) { const int detections = detectionsData[n * classesNum + c]; - int *pindices = indicesData + n * classesNum * priorsNum + c * priorsNum; + int* pindices = indicesData + n * classesNum * priorsNum + c * priorsNum; - float *pconf = reorderedConfData + n * classesNum * confInfoLen + c * confInfoLen; + float* pconf = reorderedConfData + n * 
classesNum * confInfoLen + c * confInfoLen; for (int i = 0; i < detections; ++i) { int pr = pindices[i]; @@ -330,7 +401,8 @@ void DetectionOutput::execute(dnnl::stream strm) { } }); - std::sort(confIndicesClassMap.begin(), confIndicesClassMap.end(), + std::sort(confIndicesClassMap.begin(), + confIndicesClassMap.end(), SortScorePairDescend>); confIndicesClassMap.resize(keepTopK); @@ -340,7 +412,7 @@ void DetectionOutput::execute(dnnl::stream strm) { for (size_t j = 0; j < confIndicesClassMap.size(); ++j) { const int cls = confIndicesClassMap[j].second.first; const int pr = confIndicesClassMap[j].second.second; - int *pindices = indicesData + n * classesNum * priorsNum + cls * priorsNum; + int* pindices = indicesData + n * classesNum * priorsNum + cls * priorsNum; pindices[detectionsData[n * classesNum + cls]] = pr; detectionsData[n * classesNum + cls]++; } @@ -351,7 +423,11 @@ void DetectionOutput::execute(dnnl::stream strm) { generateOutput(reorderedConfData, indicesData, detectionsData, decodedBboxesData, dstData); } -inline void DetectionOutput::confFilterCF(const float* pconf, int* pindices, int* pbuffer, int* detectionsData, const int& n) { +inline void DetectionOutput::confFilterCF(const float* pconf, + int* pindices, + int* pbuffer, + int* detectionsData, + const int& n) { // in: reorderedConf // out: pindices count int count = 0; @@ -371,21 +447,27 @@ inline void DetectionOutput::confFilterCF(const float* pconf, int* pindices, int // MX filter is per image filter, max output is prior num(select max for all class within this prior) // NMS is per class, keep topk is per image, final output is per class -inline void DetectionOutput::confFilterMX(const float* confData, const float* ARMConfData, float* reorderedConfData, - int* indicesData, int* indicesBufData, int* detectionsData, const int& n) { +inline void DetectionOutput::confFilterMX(const float* confData, + const float* ARMConfData, + float* reorderedConfData, + int* indicesData, + int* indicesBufData, + 
int* detectionsData, + const int& n) { std::mutex mtx; parallel_for(numPriorsActual[n], [&](size_t p) { // in: origin conf // out: pindices, detectionCount // intentionally code branch from higher level if (withAddBoxPred) { - const bool isARMPrior = ARMConfData[n*priorsNum*2 + p * 2 + 1] < objScore; + const bool isARMPrior = ARMConfData[n * priorsNum * 2 + p * 2 + 1] < objScore; float maxConf = -1; int maxCIdx = 0; for (int c = 1; c < classesNum; ++c) { float conf = confData[p * classesNum + c]; if (isARMPrior) - conf = (c == backgroundClassId) ? 1.0f : 0.0f; // still need refresh conf due to read from origin conf + conf = + (c == backgroundClassId) ? 1.0f : 0.0f; // still need refresh conf due to read from origin conf if (conf >= confidenceThreshold && conf > maxConf) { maxConf = conf; maxCIdx = c; @@ -394,7 +476,7 @@ inline void DetectionOutput::confFilterMX(const float* confData, const float* AR if (maxCIdx > 0) { // include this prior mtx.lock(); - indicesData[detectionsData[0]] = maxCIdx*priorsNum + p; // de-refer to get prior and class id. + indicesData[detectionsData[0]] = maxCIdx * priorsNum + p; // de-refer to get prior and class id. detectionsData[0]++; mtx.unlock(); } @@ -411,7 +493,7 @@ inline void DetectionOutput::confFilterMX(const float* confData, const float* AR if (maxCIdx > 0) { // include this prior and class with max conf mtx.lock(); - indicesData[detectionsData[0]] = maxCIdx*priorsNum + p; // de-refer to get prior and class id. + indicesData[detectionsData[0]] = maxCIdx * priorsNum + p; // de-refer to get prior and class id. detectionsData[0]++; mtx.unlock(); } @@ -423,14 +505,14 @@ inline void DetectionOutput::confFilterMX(const float* confData, const float* AR int count = detectionsData[0]; int k = (topK == -1 ? 
count : (std::min)(topK, count)); - const float *pconf = reorderedConfData; + const float* pconf = reorderedConfData; // int *indices = indicesData; // int *pbuffer = indicesBufData; topk(indicesData, indicesBufData, pconf, count, k); detectionsData[0] = k; } -inline void DetectionOutput::getActualPriorNum(const float *priorData, int* numPriorsActual, int n) { +inline void DetectionOutput::getActualPriorNum(const float* priorData, int* numPriorsActual, int n) { numPriorsActual[n] = priorsNum; if (!normalized) { int num = 0; @@ -444,16 +526,20 @@ inline void DetectionOutput::getActualPriorNum(const float *priorData, int* numP } } -inline void DetectionOutput::confReorderDense(const float *confData, const float *ARMConfData, float *reorderedConfData) { +inline void DetectionOutput::confReorderDense(const float* confData, + const float* ARMConfData, + float* reorderedConfData) { if (withAddBoxPred) { parallel_for2d(imgNum, priorsNum, [&](size_t n, size_t p) { if (ARMConfData[n * priorsNum * 2 + p * 2 + 1] < objScore) { for (int c = 0; c < classesNum; ++c) { - reorderedConfData[n * priorsNum * classesNum + c * priorsNum + p] = c == backgroundClassId ? 1.0f : 0.0f; + reorderedConfData[n * priorsNum * classesNum + c * priorsNum + p] = + c == backgroundClassId ? 
1.0f : 0.0f; } } else { for (int c = 0; c < classesNum; ++c) { - reorderedConfData[n * priorsNum * classesNum + c * priorsNum + p] = confData[n * priorsNum * classesNum + p * classesNum + c]; + reorderedConfData[n * priorsNum * classesNum + c * priorsNum + p] = + confData[n * priorsNum * classesNum + p * classesNum + c]; } } }); @@ -463,20 +549,23 @@ inline void DetectionOutput::confReorderDense(const float *confData, const float parallel_for2d(imgNum, classesNum, [&](size_t n, size_t c) { const int offset = n * priorsNum * classesNum; for (int p = 0; p < priorsNum; ++p) { - reorderedConfData[offset + c * priorsNum + p] = - confData[offset + p * classesNum + c]; + reorderedConfData[offset + c * priorsNum + p] = confData[offset + p * classesNum + c]; } }); } -inline void DetectionOutput::confReorderAndFilterSparsityCF(const float* confData, const float* ARMConfData, float* reorderedConfData, - int* indicesData, int* indicesBufData, int* detectionsData) { +inline void DetectionOutput::confReorderAndFilterSparsityCF(const float* confData, + const float* ARMConfData, + float* reorderedConfData, + int* indicesData, + int* indicesBufData, + int* detectionsData) { int* reorderedConfDataIndices = reinterpret_cast(reorderedConfData); for (int n = 0; n < imgNum; ++n) { const int off = n * priorsNum * classesNum; const int offV = n * priorsNum; // vertical info - const int offH = n * confInfoLen * classesNum; // horizontal info + const int offH = n * confInfoLen * classesNum; // horizontal info // reset count parallel_for(classesNum, [&](size_t c) { const int countIdx = offH + c * confInfoLen + priorsNum; @@ -506,7 +595,7 @@ inline void DetectionOutput::confReorderAndFilterSparsityCF(const float* confDat // vertical info for isShareLoc(flag to decode for each prior) if (!priorStatusSet && isShareLoc) { - confInfoForPrior[offV + p] = 1; // 1 for decode + confInfoForPrior[offV + p] = 1; // 1 for decode } } } @@ -542,9 +631,9 @@ inline void 
DetectionOutput::confReorderAndFilterSparsityCF(const float* confDat const int count = reorderedConfDataIndices[countIdx]; const int k = (topK == -1 ? count : (std::min)(topK, count)); - int *reorderedConfIndices = reorderedConfDataIndices + countIdx + 1; - int *pbuffer = indicesBufData + off + c * priorsNum; - const float *pconf = reorderedConfData + offH + c * confInfoLen; + int* reorderedConfIndices = reorderedConfDataIndices + countIdx + 1; + int* pbuffer = indicesBufData + off + c * priorsNum; + const float* pconf = reorderedConfData + offH + c * confInfoLen; topk(reorderedConfIndices, pbuffer, pconf, count, k); detectionsData[n * classesNum + c] = k; @@ -552,8 +641,12 @@ inline void DetectionOutput::confReorderAndFilterSparsityCF(const float* confDat } } -inline void DetectionOutput::confReorderAndFilterSparsityMX(const float* confData, const float* ARMConfData, float* reorderedConfData, - int* indicesData, int* indicesBufData, int* detectionsData) { +inline void DetectionOutput::confReorderAndFilterSparsityMX(const float* confData, + const float* ARMConfData, + float* reorderedConfData, + int* indicesData, + int* indicesBufData, + int* detectionsData) { for (int n = 0; n < imgNum; ++n) { const int off = n * priorsNum * classesNum; const int offV = n * priorsNum; // vertical info @@ -579,7 +672,7 @@ inline void DetectionOutput::confReorderAndFilterSparsityMX(const float* confDat // vertical info for isShareLoc(flag to decode for each prior) if (!priorStatusSet && isShareLoc) { - confInfoForPrior[offV + p] = 1; // 1 for decode + confInfoForPrior[offV + p] = 1; // 1 for decode } // vertical info for MXNet style(max conf for each prior) if (c != 0) { @@ -593,7 +686,8 @@ inline void DetectionOutput::confReorderAndFilterSparsityMX(const float* confDat // MXNet statistic, indices and detectionCount is for each image if (maxCIdx > 0) { mtx.lock(); - indicesData[off + detectionsData[n * classesNum]] = maxCIdx * priorsNum + p; // de-refer to get prior and class id. 
+ indicesData[off + detectionsData[n * classesNum]] = + maxCIdx * priorsNum + p; // de-refer to get prior and class id. detectionsData[n * classesNum]++; mtx.unlock(); } @@ -604,27 +698,27 @@ inline void DetectionOutput::confReorderAndFilterSparsityMX(const float* confDat const int count = detectionsData[n * classesNum]; const int k = (topK == -1 ? count : (std::min)(topK, count)); - const float *pconf = reorderedConfData + off; - int *indices = indicesData + off; - int *pbuffer = indicesBufData + off; + const float* pconf = reorderedConfData + off; + int* indices = indicesData + off; + int* pbuffer = indicesBufData + off; topk(indices, pbuffer, pconf, count, k); detectionsData[n * classesNum] = k; } } // apply locData(offset) to priordata, generate decodedBox -inline void DetectionOutput::decodeBBoxes(const float *priorData, - const float *locData, - const float *varianceData, - float *decodedBboxes, - float *decodedBboxSizes, - int* numPriorsActual, - int n, - const int& offs, - const int& priorSize, - bool decodeType, - const int *confInfoH, - const int *confInfoV) { +inline void DetectionOutput::decodeBBoxes(const float* priorData, + const float* locData, + const float* varianceData, + float* decodedBboxes, + float* decodedBboxSizes, + int* numPriorsActual, + int n, + const int& offs, + const int& priorSize, + bool decodeType, + const int* confInfoH, + const int* confInfoV) { int prNum = numPriorsActual[n]; if (!decodeType) { prNum = priorsNum; @@ -672,8 +766,8 @@ inline void DetectionOutput::decodeBBoxes(const float *priorData, newYMax = priorYMax + varianceData[p * 4 + 3] * locYMax; } } else if (codeType == CodeType::CENTER_SIZE) { - float priorWidth = priorXMax - priorXMin; - float priorHeight = priorYMax - priorYMin; + float priorWidth = priorXMax - priorXMin; + float priorHeight = priorYMax - priorYMin; float priorCenterX = (priorXMin + priorXMax) / 2.0f; float priorCenterY = (priorYMin + priorYMax) / 2.0f; @@ -682,21 +776,21 @@ inline void 
DetectionOutput::decodeBBoxes(const float *priorData, if (varianceEncodedInTarget) { // variance is encoded in target, we simply need to restore the offset predictions. - decodeBboxCenterX = locXMin * priorWidth + priorCenterX; + decodeBboxCenterX = locXMin * priorWidth + priorCenterX; decodeBboxCenterY = locYMin * priorHeight + priorCenterY; - decodeBboxWidth = std::exp(locXMax) * priorWidth; + decodeBboxWidth = std::exp(locXMax) * priorWidth; decodeBboxHeight = std::exp(locYMax) * priorHeight; } else { // variance is encoded in bbox, we need to scale the offset accordingly. - decodeBboxCenterX = varianceData[p*4 + 0] * locXMin * priorWidth + priorCenterX; - decodeBboxCenterY = varianceData[p*4 + 1] * locYMin * priorHeight + priorCenterY; - decodeBboxWidth = std::exp(varianceData[p*4 + 2] * locXMax) * priorWidth; - decodeBboxHeight = std::exp(varianceData[p*4 + 3] * locYMax) * priorHeight; + decodeBboxCenterX = varianceData[p * 4 + 0] * locXMin * priorWidth + priorCenterX; + decodeBboxCenterY = varianceData[p * 4 + 1] * locYMin * priorHeight + priorCenterY; + decodeBboxWidth = std::exp(varianceData[p * 4 + 2] * locXMax) * priorWidth; + decodeBboxHeight = std::exp(varianceData[p * 4 + 3] * locYMax) * priorHeight; } - newXMin = decodeBboxCenterX - decodeBboxWidth / 2.0f; + newXMin = decodeBboxCenterX - decodeBboxWidth / 2.0f; newYMin = decodeBboxCenterY - decodeBboxHeight / 2.0f; - newXMax = decodeBboxCenterX + decodeBboxWidth / 2.0f; + newXMax = decodeBboxCenterX + decodeBboxWidth / 2.0f; newYMax = decodeBboxCenterY + decodeBboxHeight / 2.0f; } @@ -707,25 +801,20 @@ inline void DetectionOutput::decodeBBoxes(const float *priorData, newYMax = (std::max)(0.0f, (std::min)(1.0f, newYMax)); } - decodedBboxes[p*4 + 0] = newXMin; - decodedBboxes[p*4 + 1] = newYMin; - decodedBboxes[p*4 + 2] = newXMax; - decodedBboxes[p*4 + 3] = newYMax; + decodedBboxes[p * 4 + 0] = newXMin; + decodedBboxes[p * 4 + 1] = newYMin; + decodedBboxes[p * 4 + 2] = newXMax; + decodedBboxes[p * 4 + 
3] = newYMax; decodedBboxSizes[p] = (newXMax - newXMin) * (newYMax - newYMin); }); } -inline void DetectionOutput::topk(const int *indicesIn, int *indicesOut, const float *conf, int n, int k) { - std::partial_sort_copy(indicesIn, indicesIn + n, - indicesOut, indicesOut + k, - ConfidenceComparatorDO(conf)); +inline void DetectionOutput::topk(const int* indicesIn, int* indicesOut, const float* conf, int n, int k) { + std::partial_sort_copy(indicesIn, indicesIn + n, indicesOut, indicesOut + k, ConfidenceComparatorDO(conf)); } -static inline float JaccardOverlap(const float *decodedBbox, - const float *bboxSizes, - const int idx1, - const int idx2) { +static inline float JaccardOverlap(const float* decodedBbox, const float* bboxSizes, const int idx1, const int idx2) { const float xmin1 = decodedBbox[idx1 * 4 + 0]; const float ymin1 = decodedBbox[idx1 * 4 + 1]; const float xmax1 = decodedBbox[idx1 * 4 + 2]; @@ -745,7 +834,7 @@ static inline float JaccardOverlap(const float *decodedBbox, float intersectXMax = (std::min)(xmax1, xmax2); float intersectYMax = (std::min)(ymax1, ymax2); - float intersectWidth = intersectXMax - intersectXMin; + float intersectWidth = intersectXMax - intersectXMin; float intersectHeight = intersectYMax - intersectYMin; if (intersectWidth <= 0 || intersectHeight <= 0) { @@ -760,10 +849,10 @@ static inline float JaccardOverlap(const float *decodedBbox, } inline void DetectionOutput::NMSCF(int* indicesIn, - int& detections, - int* indicesOut, - const float* bboxes, - const float* boxSizes) { + int& detections, + int* indicesOut, + const float* bboxes, + const float* boxSizes) { // nms for this class int countIn = detections; detections = 0; @@ -787,10 +876,10 @@ inline void DetectionOutput::NMSCF(int* indicesIn, } inline void DetectionOutput::NMSMX(int* indicesIn, - int* detections, - int* indicesOut, - const float* bboxes, - const float* sizes) { + int* detections, + int* indicesOut, + const float* bboxes, + const float* sizes) { // Input is 
candidate for image, output is candidate for each class within image int countIn = detections[0]; detections[0] = 0; @@ -801,8 +890,8 @@ inline void DetectionOutput::NMSMX(int* indicesIn, const int prior = idx % priorsNum; // nms within this class - int &ndetection = detections[cls]; - int *pindices = indicesOut + cls * priorsNum; + int& ndetection = detections[cls]; + int* pindices = indicesOut + cls * priorsNum; bool keep = true; for (int k = 0; k < ndetection; ++k) { @@ -825,8 +914,11 @@ inline void DetectionOutput::NMSMX(int* indicesIn, } } -inline void DetectionOutput::generateOutput(float* reorderedConfData, int* indicesData, int* detectionsData, float* decodedBboxesData, - float* dstData) { +inline void DetectionOutput::generateOutput(float* reorderedConfData, + int* indicesData, + int* detectionsData, + float* decodedBboxesData, + float* dstData) { const auto& outDims = getChildEdgeAt(0)->getMemory().getStaticDims(); const int numResults = outDims[2]; const int DETECTION_SIZE = outDims[3]; @@ -850,26 +942,22 @@ inline void DetectionOutput::generateOutput(float* reorderedConfData, int* indic // set final detection result to output blob int count = 0; for (int n = 0; n < imgNum; ++n) { - const float *pconf = reorderedConfData + n * confInfoLen * classesNum; - const float *pboxes = decodedBboxesData + n * priorsNum * 4 * locNumForClasses; - const int *pindices = indicesData + n * classesNum * priorsNum; + const float* pconf = reorderedConfData + n * confInfoLen * classesNum; + const float* pboxes = decodedBboxesData + n * priorsNum * 4 * locNumForClasses; + const int* pindices = indicesData + n * classesNum * priorsNum; for (int c = 0; c < classesNum; ++c) { for (int i = 0; i < detectionsData[n * classesNum + c]; ++i) { int prIdx = pindices[c * priorsNum + i]; dstData[count * DETECTION_SIZE + 0] = static_cast(n); - dstData[count * DETECTION_SIZE + 1] = static_cast(decreaseClassId ? c-1 : c); + dstData[count * DETECTION_SIZE + 1] = static_cast(decreaseClassId ? 
c - 1 : c); dstData[count * DETECTION_SIZE + 2] = pconf[c * confInfoLen + prIdx]; - float xmin = isShareLoc ? pboxes[prIdx * 4 + 0] : - pboxes[c * 4 * priorsNum + prIdx * 4 + 0]; - float ymin = isShareLoc ? pboxes[prIdx * 4 + 1] : - pboxes[c * 4 * priorsNum + prIdx * 4 + 1]; - float xmax = isShareLoc ? pboxes[prIdx * 4 + 2] : - pboxes[c * 4 * priorsNum + prIdx * 4 + 2]; - float ymax = isShareLoc ? pboxes[prIdx * 4 + 3] : - pboxes[c * 4 * priorsNum + prIdx * 4 + 3]; + float xmin = isShareLoc ? pboxes[prIdx * 4 + 0] : pboxes[c * 4 * priorsNum + prIdx * 4 + 0]; + float ymin = isShareLoc ? pboxes[prIdx * 4 + 1] : pboxes[c * 4 * priorsNum + prIdx * 4 + 1]; + float xmax = isShareLoc ? pboxes[prIdx * 4 + 2] : pboxes[c * 4 * priorsNum + prIdx * 4 + 2]; + float ymax = isShareLoc ? pboxes[prIdx * 4 + 3] : pboxes[c * 4 * priorsNum + prIdx * 4 + 3]; if (clipAfterNMS) { xmin = (std::max)(0.0f, (std::min)(1.0f, xmin)); @@ -898,6 +986,6 @@ bool DetectionOutput::created() const { return getType() == Type::DetectionOutput; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/detection_output.h b/src/plugins/intel_cpu/src/nodes/detection_output.h index 418898f011f313..1a42bfa9b2980a 100644 --- a/src/plugins/intel_cpu/src/nodes/detection_output.h +++ b/src/plugins/intel_cpu/src/nodes/detection_output.h @@ -15,7 +15,7 @@ class DetectionOutput : public Node { public: DetectionOutput(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; @@ -47,8 +47,8 @@ class DetectionOutput : public Node { float sparsityThreshold = 0.03f; int topK = 0; float NMSThreshold = 0.0f; - bool clipBeforeNMS = false; - bool clipAfterNMS = false; + bool clipBeforeNMS = 
false; + bool clipAfterNMS = false; int backgroundClassId = 0; bool decreaseClassId = false; int keepTopK = 0; @@ -75,28 +75,52 @@ class DetectionOutput : public Node { inline void confFilterCF(const float* pConf, int* pindices, int* pbuffer, int* detectionsData, const int& n); - inline void confFilterMX(const float* confData, const float* ARMConfData, float* reorderedConfData, - int* indicesData, int* indicesBufData, int* detectionsData, const int& n); - - inline void confReorderAndFilterSparsityCF(const float* confData, const float* ARMConfData, float* reorderedConfData, - int* indicesData, int* indicesBufData, int* detectionsData); - - inline void confReorderAndFilterSparsityMX(const float* confData, const float* ARMConfData, float* reorderedConfData, - int* indicesData, int* indicesBufData, int* detectionsData); - - inline void decodeBBoxes(const float* prior_data, const float* loc_data, const float* variance_data, - float* decoded_bboxes, float* decoded_bbox_sizes, int* num_priors_actual, int n, const int& offs, const int& pr_size, - bool decodeType = true, const int* conf_info_h = nullptr, const int* conf_info_v = nullptr); // decodeType is false after ARM - - inline void NMSCF(int* indicesIn, int& detections, int* indicesOut, - const float* bboxes, const float* boxSizes); - - inline void NMSMX(int* indicesIn, int* detections, int* indicesOut, - const float* bboxes, const float* sizes); + inline void confFilterMX(const float* confData, + const float* ARMConfData, + float* reorderedConfData, + int* indicesData, + int* indicesBufData, + int* detectionsData, + const int& n); + + inline void confReorderAndFilterSparsityCF(const float* confData, + const float* ARMConfData, + float* reorderedConfData, + int* indicesData, + int* indicesBufData, + int* detectionsData); + + inline void confReorderAndFilterSparsityMX(const float* confData, + const float* ARMConfData, + float* reorderedConfData, + int* indicesData, + int* indicesBufData, + int* detectionsData); + + 
inline void decodeBBoxes(const float* prior_data, + const float* loc_data, + const float* variance_data, + float* decoded_bboxes, + float* decoded_bbox_sizes, + int* num_priors_actual, + int n, + const int& offs, + const int& pr_size, + bool decodeType = true, + const int* conf_info_h = nullptr, + const int* conf_info_v = nullptr); // decodeType is false after ARM + + inline void NMSCF(int* indicesIn, int& detections, int* indicesOut, const float* bboxes, const float* boxSizes); + + inline void NMSMX(int* indicesIn, int* detections, int* indicesOut, const float* bboxes, const float* sizes); inline void topk(const int* indicesIn, int* indicesOut, const float* conf, int n, int k); - inline void generateOutput(float* reorderedConfData, int* indicesData, int* detectionsData, float* decodedBboxesData, float* dstData); + inline void generateOutput(float* reorderedConfData, + int* indicesData, + int* detectionsData, + float* decodedBboxesData, + float* dstData); std::vector decodedBboxes; std::vector indicesBuffer; @@ -110,6 +134,6 @@ class DetectionOutput : public Node { std::string errorPrefix; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/dft.cpp b/src/plugins/intel_cpu/src/nodes/dft.cpp index 76ecbbb36617f5..5fa8053d7024d7 100644 --- a/src/plugins/intel_cpu/src/nodes/dft.cpp +++ b/src/plugins/intel_cpu/src/nodes/dft.cpp @@ -4,17 +4,17 @@ #include "dft.h" +#include +#include #include #include -#include -#include "dnnl_extension_utils.h" -#include "openvino/core/parallel.hpp" +#include "common/cpu_memcpy.h" +#include "dnnl_extension_utils.h" #include "onednn/dnnl.h" +#include "openvino/core/parallel.hpp" #include "utils/general_utils.h" -#include "common/cpu_memcpy.h" #include "utils/ngraph_utils.hpp" -#include using namespace dnnl::impl; using namespace dnnl::impl::cpu::x64; @@ -104,10 +104,10 @@ void 
DFT::initSupportedPrimitiveDescriptors() { } } - std::vector inDataConfigurators({{LayoutType::ncsp, ov::element::f32}, - {LayoutType::ncsp, ov::element::i32}}); + std::vector inDataConfigurators( + {{LayoutType::ncsp, ov::element::f32}, {LayoutType::ncsp, ov::element::i32}}); if (inputShapes.size() > SIGNAL_SIZE_INDEX) - inDataConfigurators.push_back({LayoutType::ncsp, ov::element::i32}); + inDataConfigurators.push_back({LayoutType::ncsp, ov::element::i32}); addSupportedPrimDesc(inDataConfigurators, {{LayoutType::ncsp, ov::element::f32}}, impl_desc_type::ref_any); } @@ -172,8 +172,12 @@ size_t calculateOffsetFromStrides(const std::vector& coords, const std:: return offset; } -void gatherToBufferND(float* buffer, const float* data, size_t axis, const std::vector& dimIndexes, - const std::vector& shape, const std::vector& strides) { +void gatherToBufferND(float* buffer, + const float* data, + size_t axis, + const std::vector& dimIndexes, + const std::vector& shape, + const std::vector& strides) { size_t numberOfComplex = shape[axis]; size_t offset = calculateOffsetFromStrides(dimIndexes, strides); @@ -184,8 +188,12 @@ void gatherToBufferND(float* buffer, const float* data, size_t axis, const std:: } } -void applyBufferND(const float* buffer, float* output, size_t axis, const std::vector& dimIndexes, - const std::vector& shape, const std::vector& strides) { +void applyBufferND(const float* buffer, + float* output, + size_t axis, + const std::vector& dimIndexes, + const std::vector& shape, + const std::vector& strides) { size_t numberOfComplex = shape[axis]; size_t offset = calculateOffsetFromStrides(dimIndexes, strides); @@ -196,8 +204,12 @@ void applyBufferND(const float* buffer, float* output, size_t axis, const std::v } } -void copyDataToOutputWithSignalSize(const float* input, const std::vector& inputShape, const std::vector& inputStrides, - float* output, const std::vector& outputShape, const std::vector& outputStrides) { +void 
copyDataToOutputWithSignalSize(const float* input, + const std::vector& inputShape, + const std::vector& inputStrides, + float* output, + const std::vector& outputShape, + const std::vector& outputStrides) { auto totalInput = std::accumulate(inputShape.begin(), inputShape.end(), size_t(1), std::multiplies()); auto totalOutput = std::accumulate(outputShape.begin(), outputShape.end(), size_t(1), std::multiplies()); std::fill_n(output, totalOutput, 0.f); @@ -221,7 +233,10 @@ void copyDataToOutputWithSignalSize(const float* input, const std::vector inputStridesRange(inputStrides.begin(), inputStrides.begin() + iterationRange.size()); const std::vector outputStridesRange(outputStrides.begin(), outputStrides.begin() + iterationRange.size()); - const size_t blockSize = std::accumulate(inputShape.begin() + lastChangedDim + 1, inputShape.end(), size_t(1), std::multiplies()); + const size_t blockSize = std::accumulate(inputShape.begin() + lastChangedDim + 1, + inputShape.end(), + size_t(1), + std::multiplies()); const size_t blockSizeBytes = blockSize * sizeof(float); std::vector iterationCounter(iterationRange.size(), 0); do { @@ -231,7 +246,7 @@ void copyDataToOutputWithSignalSize(const float* input, const std::vectorgetMemory().getStaticDims(); @@ -269,7 +284,8 @@ void DFT::execute(dnnl::stream strm) { if (inputShape != outputShape) { copyDataToOutputWithSignalSize(src, inputShape, inputStrides, dst, outputShape, outputStrides); } else { - auto totalElements = std::accumulate(inputShape.begin(), inputShape.end(), size_t(1), std::multiplies()); + auto totalElements = + std::accumulate(inputShape.begin(), inputShape.end(), size_t(1), std::multiplies()); cpu_memcpy(dst, src, totalElements * sizeof(float)); } @@ -315,17 +331,32 @@ void DFT::dftNd(float* output, std::vector gatheredData(outputLen * 2); auto parallelIterationCounter = iterationCounter; parallelIterationCounter[parallelDimIndex] = dim; - gatherToBufferND(gatheredData.data(), output, currentAxis, 
parallelIterationCounter, outputShape, outputStrides); + gatherToBufferND(gatheredData.data(), + output, + currentAxis, + parallelIterationCounter, + outputShape, + outputStrides); const float* resultBufPtr; fft(gatheredData.data(), gatheredData.data() + outputLen, outputLen, inverse, false, &resultBufPtr); - applyBufferND(resultBufPtr, output, currentAxis, parallelIterationCounter, outputShape, outputStrides); + applyBufferND(resultBufPtr, + output, + currentAxis, + parallelIterationCounter, + outputShape, + outputStrides); }); iterationCounter[parallelDimIndex] = iterationRange[parallelDimIndex] - 1; } while (nextIterationStep(iterationCounter, iterationRange, currentAxis)); } else { std::vector gatheredData(outputLen); do { - gatherToBufferND(gatheredData.data(), output, currentAxis, iterationCounter, outputShape, outputStrides); + gatherToBufferND(gatheredData.data(), + output, + currentAxis, + iterationCounter, + outputShape, + outputStrides); naiveDFT(gatheredData.data(), outputLen, inverse); applyBufferND(gatheredData.data(), output, currentAxis, iterationCounter, outputShape, outputStrides); } while (nextIterationStep(iterationCounter, iterationRange, currentAxis)); @@ -585,6 +616,6 @@ void DFT::createJITKernels(bool hasDFT, bool hasFFT) { } #endif } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/dft.h b/src/plugins/intel_cpu/src/nodes/dft.h index 82b6ea3b33a618..eef5e2ea529066 100644 --- a/src/plugins/intel_cpu/src/nodes/dft.h +++ b/src/plugins/intel_cpu/src/nodes/dft.h @@ -63,6 +63,6 @@ class DFT : public Node { bool lastInverse; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/eltwise.cpp b/src/plugins/intel_cpu/src/nodes/eltwise.cpp index 54cf435009059d..5daefa01eddfab 100644 --- 
a/src/plugins/intel_cpu/src/nodes/eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/eltwise.cpp @@ -3,6 +3,18 @@ // #include "eltwise.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + #include "common/cpu_convert.h" #include "common/float16.hpp" #include "common/primitive_hashing_utils.hpp" @@ -10,6 +22,10 @@ #include "cpu/ref_eltwise.hpp" #include "cpu_types.h" #include "dnnl_extension_utils.h" +#include "emitters/plugin/x64/jit_bf16_emitters.hpp" +#include "emitters/plugin/x64/jit_dnnl_emitters.hpp" +#include "emitters/plugin/x64/jit_eltwise_emitters.hpp" +#include "emitters/plugin/x64/jit_emitter.hpp" #include "fake_quantize.h" #include "input.h" #include "memory_desc/dnnl_blocked_memory_desc.h" @@ -17,13 +33,13 @@ #include "onednn/dnnl.h" #include "openvino/core/except.hpp" #include "openvino/core/parallel.hpp" -#include "openvino/opsets/opset1.hpp" #include "openvino/op/bitwise_and.hpp" #include "openvino/op/bitwise_left_shift.hpp" #include "openvino/op/bitwise_not.hpp" #include "openvino/op/bitwise_or.hpp" #include "openvino/op/bitwise_right_shift.hpp" #include "openvino/op/bitwise_xor.hpp" +#include "openvino/opsets/opset1.hpp" #include "pooling.h" #include "selective_build.h" #include "shape_inference/custom/eltwise.hpp" @@ -35,27 +51,10 @@ #include "utils/general_utils.h" #include "utils/ngraph_utils.hpp" -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "emitters/plugin/x64/jit_emitter.hpp" -#include "emitters/plugin/x64/jit_eltwise_emitters.hpp" -#include "emitters/plugin/x64/jit_dnnl_emitters.hpp" -#include "emitters/plugin/x64/jit_bf16_emitters.hpp" - #if defined(OPENVINO_ARCH_ARM64) -#include "cpu/aarch64/cpu_isa_traits.hpp" -#include "kernels/aarch64/jit_uni_eltwise_generic.hpp" -#include "executors/aarch64/jit_eltwise.hpp" +# include "cpu/aarch64/cpu_isa_traits.hpp" +# include "executors/aarch64/jit_eltwise.hpp" +# include 
"kernels/aarch64/jit_uni_eltwise_generic.hpp" #endif using namespace dnnl::impl::utils; @@ -92,60 +91,72 @@ bool jitIsSupported(const Node* node, beta, gamma); } -} // namespace +} // namespace #endif #if defined(OPENVINO_ARCH_X86_64) -template +template struct SupportedPrecisions { - void operator()(std::set> &precisions) { + void operator()(std::set>& precisions) { precisions = T::get_supported_precisions(); } }; struct EltwiseEmitterContext { std::shared_ptr emitter; - jit_generator *host; + jit_generator* host; cpu_isa_t host_isa; const EltwiseData& opData; ov::element::Type exec_prc; }; -template +template struct EltwiseEmitter { - void operator()(EltwiseEmitterContext & ctx) { + void operator()(EltwiseEmitterContext& ctx) { ctx.emitter = std::make_shared(ctx.host, ctx.host_isa, ctx.exec_prc); } }; -template<> +template <> struct EltwiseEmitter { - void operator()(EltwiseEmitterContext & ctx) { + void operator()(EltwiseEmitterContext& ctx) { auto algKind = static_cast(ctx.opData.onednnAlgorithm); - ctx.emitter = std::make_shared(ctx.host, ctx.host_isa, algKind, - ctx.opData.alpha, ctx.opData.beta, ctx.exec_prc); + ctx.emitter = std::make_shared(ctx.host, + ctx.host_isa, + algKind, + ctx.opData.alpha, + ctx.opData.beta, + ctx.exec_prc); } }; -template<> +template <> struct EltwiseEmitter { - void operator()(EltwiseEmitterContext & ctx) { - ctx.emitter = std::make_shared(ctx.host, ctx.host_isa, ctx.opData.alpha, - ctx.opData.beta, ctx.opData.gamma, ctx.exec_prc); + void operator()(EltwiseEmitterContext& ctx) { + ctx.emitter = std::make_shared(ctx.host, + ctx.host_isa, + ctx.opData.alpha, + ctx.opData.beta, + ctx.opData.gamma, + ctx.exec_prc); } }; -template<> +template <> struct EltwiseEmitter { - void operator()(EltwiseEmitterContext & ctx) { - ctx.emitter = std::make_shared(ctx.host, ctx.host_isa, ctx.exec_prc, ctx.opData.alpha, ctx.opData.beta); + void operator()(EltwiseEmitterContext& ctx) { + ctx.emitter = std::make_shared(ctx.host, + ctx.host_isa, + 
ctx.exec_prc, + ctx.opData.alpha, + ctx.opData.beta); } }; static void set_intersection(const std::set>& precisions1, - const std::set>& precisions2, - std::set>& intersection) { + const std::set>& precisions2, + std::set>& intersection) { std::map intersection_types; for (auto it1 = precisions1.begin(); it1 != precisions1.end(); ++it1) { @@ -195,15 +206,8 @@ ov::element::Type eltwise_precision_helper::get_precision(const size_t inputs_nu supported_precision_intersection = prcs_intersect; } - static const element::Type exec_precisions_priority[] = { - element::u8, - element::i8, - element::u16, - element::i16, - element::bf16, - element::i32, - element::f32 - }; + static const element::Type exec_precisions_priority[] = + {element::u8, element::i8, element::u16, element::i16, element::bf16, element::i32, element::f32}; for (const auto prc : exec_precisions_priority) { if (std::any_of(supported_precision_intersection.begin(), @@ -234,59 +238,62 @@ ov::element::Type eltwise_precision_helper::get_precision(const size_t inputs_nu std::set> eltwise_precision_helper::get_supported_precisions(const Algorithm& algo) { std::set> precisions; - OV_SWITCH(intel_cpu, SupportedPrecisions, precisions, algo, - OV_CASE(Algorithm::EltwiseRelu, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseGeluErf, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseGeluTanh, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseElu, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseTanh, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseSigmoid, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseAbs, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseSqrt, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseSoftRelu, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseClamp, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseSwish, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseHswish, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseMish, jit_dnnl_aux_emitter), - 
OV_CASE(Algorithm::EltwiseHsigmoid, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseRoundHalfToEven, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseRoundHalfAwayFromZero, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseAdd, jit_add_emitter), - OV_CASE(Algorithm::EltwiseMulAdd, jit_mul_add_emitter), - OV_CASE(Algorithm::EltwiseSubtract, jit_subtract_emitter), - OV_CASE(Algorithm::EltwiseMultiply, jit_multiply_emitter), - OV_CASE(Algorithm::EltwiseDivide, jit_divide_emitter), - OV_CASE(Algorithm::EltwiseFloor, jit_floor_emitter), - OV_CASE(Algorithm::EltwiseCeiling, jit_ceiling_emitter), - OV_CASE(Algorithm::EltwiseFloorMod, jit_floor_mod_emitter), - OV_CASE(Algorithm::EltwiseMod, jit_mod_emitter), - OV_CASE(Algorithm::EltwiseMaximum, jit_maximum_emitter), - OV_CASE(Algorithm::EltwiseMinimum, jit_minimum_emitter), - OV_CASE(Algorithm::EltwiseExp, jit_exp_emitter), - OV_CASE(Algorithm::EltwiseSquaredDifference, jit_squared_difference_emitter), - OV_CASE(Algorithm::EltwisePowerDynamic, jit_power_dynamic_emitter), - OV_CASE(Algorithm::EltwiseEqual, jit_equal_emitter), - OV_CASE(Algorithm::EltwiseNotEqual, jit_not_equal_emitter), - OV_CASE(Algorithm::EltwiseGreater, jit_greater_emitter), - OV_CASE(Algorithm::EltwiseGreaterEqual, jit_greater_equal_emitter), - OV_CASE(Algorithm::EltwiseLess, jit_less_emitter), - OV_CASE(Algorithm::EltwiseLessEqual, jit_less_equal_emitter), - OV_CASE(Algorithm::EltwiseLogicalAnd, jit_logical_and_emitter), - OV_CASE(Algorithm::EltwiseLogicalOr, jit_logical_or_emitter), - OV_CASE(Algorithm::EltwiseLogicalXor, jit_logical_xor_emitter), - OV_CASE(Algorithm::EltwiseLogicalNot, jit_logical_not_emitter), - OV_CASE(Algorithm::EltwisePowerStatic, jit_power_static_emitter), - OV_CASE(Algorithm::EltwisePrelu, jit_prelu_emitter), - OV_CASE(Algorithm::EltwiseErf, jit_erf_emitter), - OV_CASE(Algorithm::EltwiseSoftSign, jit_soft_sign_emitter), - OV_CASE(Algorithm::EltwiseIsFinite, jit_is_finite_emitter), - OV_CASE(Algorithm::EltwiseIsInf, 
jit_is_inf_emitter), - OV_CASE(Algorithm::EltwiseIsNaN, jit_is_nan_emitter), - OV_CASE(Algorithm::EltwiseSelect, jit_select_emitter), - OV_CASE(Algorithm::EltwiseBitwiseAnd, jit_bitwise_and_emitter), - OV_CASE(Algorithm::EltwiseBitwiseNot, jit_bitwise_not_emitter), - OV_CASE(Algorithm::EltwiseBitwiseOr, jit_bitwise_or_emitter), - OV_CASE(Algorithm::EltwiseBitwiseXor, jit_bitwise_xor_emitter)); + OV_SWITCH(intel_cpu, + SupportedPrecisions, + precisions, + algo, + OV_CASE(Algorithm::EltwiseRelu, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseGeluErf, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseGeluTanh, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseElu, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseTanh, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseSigmoid, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseAbs, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseSqrt, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseSoftRelu, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseClamp, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseSwish, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseHswish, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseMish, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseHsigmoid, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseRoundHalfToEven, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseRoundHalfAwayFromZero, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseAdd, jit_add_emitter), + OV_CASE(Algorithm::EltwiseMulAdd, jit_mul_add_emitter), + OV_CASE(Algorithm::EltwiseSubtract, jit_subtract_emitter), + OV_CASE(Algorithm::EltwiseMultiply, jit_multiply_emitter), + OV_CASE(Algorithm::EltwiseDivide, jit_divide_emitter), + OV_CASE(Algorithm::EltwiseFloor, jit_floor_emitter), + OV_CASE(Algorithm::EltwiseCeiling, jit_ceiling_emitter), + OV_CASE(Algorithm::EltwiseFloorMod, jit_floor_mod_emitter), + OV_CASE(Algorithm::EltwiseMod, jit_mod_emitter), + OV_CASE(Algorithm::EltwiseMaximum, jit_maximum_emitter), 
+ OV_CASE(Algorithm::EltwiseMinimum, jit_minimum_emitter), + OV_CASE(Algorithm::EltwiseExp, jit_exp_emitter), + OV_CASE(Algorithm::EltwiseSquaredDifference, jit_squared_difference_emitter), + OV_CASE(Algorithm::EltwisePowerDynamic, jit_power_dynamic_emitter), + OV_CASE(Algorithm::EltwiseEqual, jit_equal_emitter), + OV_CASE(Algorithm::EltwiseNotEqual, jit_not_equal_emitter), + OV_CASE(Algorithm::EltwiseGreater, jit_greater_emitter), + OV_CASE(Algorithm::EltwiseGreaterEqual, jit_greater_equal_emitter), + OV_CASE(Algorithm::EltwiseLess, jit_less_emitter), + OV_CASE(Algorithm::EltwiseLessEqual, jit_less_equal_emitter), + OV_CASE(Algorithm::EltwiseLogicalAnd, jit_logical_and_emitter), + OV_CASE(Algorithm::EltwiseLogicalOr, jit_logical_or_emitter), + OV_CASE(Algorithm::EltwiseLogicalXor, jit_logical_xor_emitter), + OV_CASE(Algorithm::EltwiseLogicalNot, jit_logical_not_emitter), + OV_CASE(Algorithm::EltwisePowerStatic, jit_power_static_emitter), + OV_CASE(Algorithm::EltwisePrelu, jit_prelu_emitter), + OV_CASE(Algorithm::EltwiseErf, jit_erf_emitter), + OV_CASE(Algorithm::EltwiseSoftSign, jit_soft_sign_emitter), + OV_CASE(Algorithm::EltwiseIsFinite, jit_is_finite_emitter), + OV_CASE(Algorithm::EltwiseIsInf, jit_is_inf_emitter), + OV_CASE(Algorithm::EltwiseIsNaN, jit_is_nan_emitter), + OV_CASE(Algorithm::EltwiseSelect, jit_select_emitter), + OV_CASE(Algorithm::EltwiseBitwiseAnd, jit_bitwise_and_emitter), + OV_CASE(Algorithm::EltwiseBitwiseNot, jit_bitwise_not_emitter), + OV_CASE(Algorithm::EltwiseBitwiseOr, jit_bitwise_or_emitter), + OV_CASE(Algorithm::EltwiseBitwiseXor, jit_bitwise_xor_emitter)); if (precisions.empty()) OPENVINO_THROW("Unsupported operation type for Eltwise emitter"); @@ -302,7 +309,11 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener const std::vector& eltwise_data, const std::vector& ops_list, const dnnl::post_ops& post_ops) - : jit_uni_eltwise_kernel(jep), jit_generator(jit_name()), eltwise_data_(eltwise_data), 
ops_list_(ops_list), post_ops_(post_ops) {} + : jit_uni_eltwise_kernel(jep), + jit_generator(jit_name()), + eltwise_data_(eltwise_data), + ops_list_(ops_list), + post_ops_(post_ops) {} void create_ker() override { jit_generator::create_kernel(); @@ -322,14 +333,18 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener if (!p->entry_[i].is_quantization()) { OPENVINO_THROW("Eltwise jitter error. Unsupported post op detected"); } - quantization_injectors.push_back(std::make_shared>( - this, p->entry_[i], vmm_d_weights, vmm_d_bias, reg_d_weights, reg_d_bias)); + quantization_injectors.push_back(std::make_shared>(this, + p->entry_[i], + vmm_d_weights, + vmm_d_bias, + reg_d_weights, + reg_d_bias)); } if (mayiuse(avx512_core) || mayiuse(avx2_vnni_2)) uni_vcvtneps2bf16.reset(new jit_uni_vcvtneps2bf16(this, isa)); - const auto &jep = jep_; + const auto& jep = jep_; this->preamble(); @@ -435,7 +450,11 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener for (size_t j = 0; j < min_src_size / vec_step; j++) { for (size_t i = 0; i < jep.inputs_number; i++) { if (jep.src_size[i] != 1) - load_vector(get_vmm_reg(i), ptr[get_src_reg(i) + j * vec_step * jep.src_prc[i].size()], jep.src_prc[i], exec_prc, false); + load_vector(get_vmm_reg(i), + ptr[get_src_reg(i) + j * vec_step * jep.src_prc[i].size()], + jep.src_prc[i], + exec_prc, + false); } compute_eltwise_op(); @@ -449,7 +468,10 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener for (size_t j = tail_start; j < min_src_size; j++) { for (size_t i = 0; i < jep.inputs_number; i++) { if (jep.src_size[i] != 1) - load_scalar(get_xmm_reg(i), ptr[get_src_reg(i) + j * jep.src_prc[i].size()], jep.src_prc[i], exec_prc); + load_scalar(get_xmm_reg(i), + ptr[get_src_reg(i) + j * jep.src_prc[i].size()], + jep.src_prc[i], + exec_prc); } compute_eltwise_op(); @@ -571,7 +593,7 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public 
jit_gener } Reg64 reg_post_op_ptrs = rax; - Reg64 start_to_offsets = reg_post_op_ptrs; // rax + Reg64 start_to_offsets = reg_post_op_ptrs; // rax Reg64 reg_dst = rbx; Reg64 reg_work_amount = rdx; @@ -606,67 +628,64 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener const dnnl::post_ops& post_ops_; std::shared_ptr create_eltwise_emitter(const EltwiseData& data, ov::element::Type exec_prec) { - EltwiseEmitterContext ctx = { - nullptr, - this, - isa, - data, - exec_prec - }; - - OV_SWITCH(intel_cpu, EltwiseEmitter, ctx, data.algo, - OV_CASE(Algorithm::EltwiseRelu, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseGeluErf, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseGeluTanh, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseElu, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseTanh, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseSigmoid, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseAbs, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseSqrt, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseSoftRelu, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseClamp, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseSwish, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseHswish, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseMish, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseHsigmoid, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseRoundHalfToEven, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseRoundHalfAwayFromZero, jit_dnnl_aux_emitter), - OV_CASE(Algorithm::EltwiseAdd, jit_add_emitter), - OV_CASE(Algorithm::EltwiseMulAdd, jit_mul_add_emitter), - OV_CASE(Algorithm::EltwiseSubtract, jit_subtract_emitter), - OV_CASE(Algorithm::EltwiseMultiply, jit_multiply_emitter), - OV_CASE(Algorithm::EltwiseDivide, jit_divide_emitter), - OV_CASE(Algorithm::EltwiseFloor, jit_floor_emitter), - OV_CASE(Algorithm::EltwiseCeiling, jit_ceiling_emitter), - OV_CASE(Algorithm::EltwiseFloorMod, jit_floor_mod_emitter), - 
OV_CASE(Algorithm::EltwiseMod, jit_mod_emitter), - OV_CASE(Algorithm::EltwiseMaximum, jit_maximum_emitter), - OV_CASE(Algorithm::EltwiseMinimum, jit_minimum_emitter), - OV_CASE(Algorithm::EltwiseExp, jit_exp_emitter), - OV_CASE(Algorithm::EltwiseSquaredDifference, jit_squared_difference_emitter), - OV_CASE(Algorithm::EltwisePowerDynamic, jit_power_dynamic_emitter), - OV_CASE(Algorithm::EltwiseEqual, jit_equal_emitter), - OV_CASE(Algorithm::EltwiseNotEqual, jit_not_equal_emitter), - OV_CASE(Algorithm::EltwiseGreater, jit_greater_emitter), - OV_CASE(Algorithm::EltwiseGreaterEqual, jit_greater_equal_emitter), - OV_CASE(Algorithm::EltwiseLess, jit_less_emitter), - OV_CASE(Algorithm::EltwiseLessEqual, jit_less_equal_emitter), - OV_CASE(Algorithm::EltwiseLogicalAnd, jit_logical_and_emitter), - OV_CASE(Algorithm::EltwiseLogicalOr, jit_logical_or_emitter), - OV_CASE(Algorithm::EltwiseLogicalXor, jit_logical_xor_emitter), - OV_CASE(Algorithm::EltwiseLogicalNot, jit_logical_not_emitter), - OV_CASE(Algorithm::EltwisePowerStatic, jit_power_static_emitter), - OV_CASE(Algorithm::EltwisePrelu, jit_prelu_emitter), - OV_CASE(Algorithm::EltwiseErf, jit_erf_emitter), - OV_CASE(Algorithm::EltwiseSoftSign, jit_soft_sign_emitter), - OV_CASE(Algorithm::EltwiseIsFinite, jit_is_finite_emitter), - OV_CASE(Algorithm::EltwiseIsInf, jit_is_inf_emitter), - OV_CASE(Algorithm::EltwiseIsNaN, jit_is_nan_emitter), - OV_CASE(Algorithm::EltwiseSelect, jit_select_emitter), - OV_CASE(Algorithm::EltwiseBitwiseAnd, jit_bitwise_and_emitter), - OV_CASE(Algorithm::EltwiseBitwiseNot, jit_bitwise_not_emitter), - OV_CASE(Algorithm::EltwiseBitwiseOr, jit_bitwise_or_emitter), - OV_CASE(Algorithm::EltwiseBitwiseXor, jit_bitwise_xor_emitter)); + EltwiseEmitterContext ctx = {nullptr, this, isa, data, exec_prec}; + + OV_SWITCH(intel_cpu, + EltwiseEmitter, + ctx, + data.algo, + OV_CASE(Algorithm::EltwiseRelu, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseGeluErf, jit_dnnl_aux_emitter), + 
OV_CASE(Algorithm::EltwiseGeluTanh, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseElu, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseTanh, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseSigmoid, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseAbs, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseSqrt, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseSoftRelu, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseClamp, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseSwish, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseHswish, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseMish, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseHsigmoid, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseRoundHalfToEven, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseRoundHalfAwayFromZero, jit_dnnl_aux_emitter), + OV_CASE(Algorithm::EltwiseAdd, jit_add_emitter), + OV_CASE(Algorithm::EltwiseMulAdd, jit_mul_add_emitter), + OV_CASE(Algorithm::EltwiseSubtract, jit_subtract_emitter), + OV_CASE(Algorithm::EltwiseMultiply, jit_multiply_emitter), + OV_CASE(Algorithm::EltwiseDivide, jit_divide_emitter), + OV_CASE(Algorithm::EltwiseFloor, jit_floor_emitter), + OV_CASE(Algorithm::EltwiseCeiling, jit_ceiling_emitter), + OV_CASE(Algorithm::EltwiseFloorMod, jit_floor_mod_emitter), + OV_CASE(Algorithm::EltwiseMod, jit_mod_emitter), + OV_CASE(Algorithm::EltwiseMaximum, jit_maximum_emitter), + OV_CASE(Algorithm::EltwiseMinimum, jit_minimum_emitter), + OV_CASE(Algorithm::EltwiseExp, jit_exp_emitter), + OV_CASE(Algorithm::EltwiseSquaredDifference, jit_squared_difference_emitter), + OV_CASE(Algorithm::EltwisePowerDynamic, jit_power_dynamic_emitter), + OV_CASE(Algorithm::EltwiseEqual, jit_equal_emitter), + OV_CASE(Algorithm::EltwiseNotEqual, jit_not_equal_emitter), + OV_CASE(Algorithm::EltwiseGreater, jit_greater_emitter), + OV_CASE(Algorithm::EltwiseGreaterEqual, jit_greater_equal_emitter), + OV_CASE(Algorithm::EltwiseLess, jit_less_emitter), + 
OV_CASE(Algorithm::EltwiseLessEqual, jit_less_equal_emitter), + OV_CASE(Algorithm::EltwiseLogicalAnd, jit_logical_and_emitter), + OV_CASE(Algorithm::EltwiseLogicalOr, jit_logical_or_emitter), + OV_CASE(Algorithm::EltwiseLogicalXor, jit_logical_xor_emitter), + OV_CASE(Algorithm::EltwiseLogicalNot, jit_logical_not_emitter), + OV_CASE(Algorithm::EltwisePowerStatic, jit_power_static_emitter), + OV_CASE(Algorithm::EltwisePrelu, jit_prelu_emitter), + OV_CASE(Algorithm::EltwiseErf, jit_erf_emitter), + OV_CASE(Algorithm::EltwiseSoftSign, jit_soft_sign_emitter), + OV_CASE(Algorithm::EltwiseIsFinite, jit_is_finite_emitter), + OV_CASE(Algorithm::EltwiseIsInf, jit_is_inf_emitter), + OV_CASE(Algorithm::EltwiseIsNaN, jit_is_nan_emitter), + OV_CASE(Algorithm::EltwiseSelect, jit_select_emitter), + OV_CASE(Algorithm::EltwiseBitwiseAnd, jit_bitwise_and_emitter), + OV_CASE(Algorithm::EltwiseBitwiseNot, jit_bitwise_not_emitter), + OV_CASE(Algorithm::EltwiseBitwiseOr, jit_bitwise_or_emitter), + OV_CASE(Algorithm::EltwiseBitwiseXor, jit_bitwise_xor_emitter)); if (!ctx.emitter) OPENVINO_THROW("Unsupported operation type for Eltwise emitter"); @@ -714,17 +733,31 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener bool do_rounding = do_dequantization || jep_.dst_prc == ov::element::f32 || i != ops_list_.size() - 1; int s_idx = vmm_dst.getIdx(); - size_t ptrs_table_off = quantization_post_op_idx * quantization_injectors[quantization_post_op_idx]->memoryStep(); - - quantization_injectors[quantization_post_op_idx]->init_crop_ptrs(reg_post_op_ptrs + ptrs_table_off, reg_oc_off); - quantization_injectors[quantization_post_op_idx]->compute_crop(s_idx, s_idx + 1, offset, is_scalar, jep_.oc_size == 1); - - quantization_injectors[quantization_post_op_idx]->init_input_scale_shift_ptrs(reg_post_op_ptrs + ptrs_table_off, reg_oc_off); - quantization_injectors[quantization_post_op_idx]->compute_input_scale_shift(s_idx, s_idx + 1, offset, do_rounding, - is_scalar, 
jep_.oc_size == 1); - - quantization_injectors[quantization_post_op_idx]->init_output_scale_shift_ptrs(reg_post_op_ptrs + ptrs_table_off, reg_oc_off); - quantization_injectors[quantization_post_op_idx]->compute_output_scale_shift(s_idx, s_idx + 1, offset, is_scalar, jep_.oc_size == 1); + size_t ptrs_table_off = + quantization_post_op_idx * quantization_injectors[quantization_post_op_idx]->memoryStep(); + + quantization_injectors[quantization_post_op_idx]->init_crop_ptrs(reg_post_op_ptrs + ptrs_table_off, + reg_oc_off); + quantization_injectors[quantization_post_op_idx]->compute_crop(s_idx, + s_idx + 1, + offset, + is_scalar, + jep_.oc_size == 1); + + quantization_injectors[quantization_post_op_idx]->init_input_scale_shift_ptrs( + reg_post_op_ptrs + ptrs_table_off, + reg_oc_off); + quantization_injectors[quantization_post_op_idx] + ->compute_input_scale_shift(s_idx, s_idx + 1, offset, do_rounding, is_scalar, jep_.oc_size == 1); + + quantization_injectors[quantization_post_op_idx]->init_output_scale_shift_ptrs( + reg_post_op_ptrs + ptrs_table_off, + reg_oc_off); + quantization_injectors[quantization_post_op_idx]->compute_output_scale_shift(s_idx, + s_idx + 1, + offset, + is_scalar, + jep_.oc_size == 1); quantization_post_op_idx++; } else { @@ -733,7 +766,11 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener } } - inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, ov::element::Type src_prc, ov::element::Type dst_prc, bool broadcast) { + inline void load_vector(Vmm vmm_src, + const Xbyak::Address& op, + ov::element::Type src_prc, + ov::element::Type dst_prc, + bool broadcast) { Xmm xmm_src = Xmm(vmm_src.getIdx()); if (src_prc == dst_prc) { @@ -751,120 +788,126 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener uni_vbroadcastss(vmm_src, xmm_src); } else { switch (src_prc) { - case ov::element::f32: - case ov::element::i32: - uni_vmovups(vmm_src, op); - break; - case ov::element::bf16: - 
vpmovzxwd(vmm_src, op); - uni_vpslld(vmm_src, vmm_src, 16); - break; - case ov::element::f16: - vcvtph2ps(vmm_src, op); - break; - case ov::element::u16: - uni_vpmovzxwd(vmm_src, op); - break; - case ov::element::i16: - uni_vpmovsxwd(vmm_src, op); - break; - case ov::element::i8: - uni_vpmovsxbd(vmm_src, op); - break; - case ov::element::u8: - uni_vpmovzxbd(vmm_src, op); - break; - default: - OPENVINO_THROW("unknown src_prc"); - } - - switch (dst_prc) { - case ov::element::f32: - if (!src_prc.is_real()) - uni_vcvtdq2ps(vmm_src, vmm_src); - break; - case ov::element::i32: - if (src_prc.is_real()) - uni_vcvtps2dq(vmm_src, vmm_src); - break; - default: - OPENVINO_THROW("unknown dst_prc"); - } - } - } - - inline void load_scalar(Xmm xmm_src, const Xbyak::Address &op, ov::element::Type src_prc, ov::element::Type dst_prc) { - if (src_prc == dst_prc) { - switch (src_prc.size()) { - case 4: - uni_vmovss(xmm_src, op); - break; - case 1: - mov(reg_tmp_8, op); - movzx(reg_tmp_32, reg_tmp_8); - uni_vmovd(xmm_src, reg_tmp_32); - break; - default: - OPENVINO_THROW("unknown prc"); - } - return; - } - - switch (src_prc) { case ov::element::f32: case ov::element::i32: - uni_vmovss(xmm_src, op); + uni_vmovups(vmm_src, op); break; case ov::element::bf16: - if (isa == x64::avx2_vnni_2) { - vbcstnebf162ps(xmm_src, op); - } else { - uni_vpinsrw(xmm_src, xmm_src, op, 0); - uni_vpslld(xmm_src, xmm_src, 16); - } + vpmovzxwd(vmm_src, op); + uni_vpslld(vmm_src, vmm_src, 16); break; case ov::element::f16: - if (isa == x64::avx2_vnni_2) { - vbcstnesh2ps(xmm_src, op); - } else { - vcvtph2ps(xmm_src, op); - } - break; - case ov::element::i16: - uni_vpinsrw(xmm_src, xmm_src, op, 0); - uni_vpmovsxwd(xmm_src, op); + vcvtph2ps(vmm_src, op); break; case ov::element::u16: - uni_vpinsrw(xmm_src, xmm_src, op, 0); - uni_vpmovzxwd(xmm_src, op); + uni_vpmovzxwd(vmm_src, op); + break; + case ov::element::i16: + uni_vpmovsxwd(vmm_src, op); break; case ov::element::i8: - movsx(reg_tmp_32, op); - 
uni_vmovq(xmm_src, reg_tmp_64); + uni_vpmovsxbd(vmm_src, op); break; case ov::element::u8: - movzx(reg_tmp_32, op); - uni_vmovq(xmm_src, reg_tmp_64); + uni_vpmovzxbd(vmm_src, op); break; default: OPENVINO_THROW("unknown src_prc"); - } + } - switch (dst_prc) { + switch (dst_prc) { case ov::element::f32: if (!src_prc.is_real()) - uni_vcvtdq2ps(xmm_src, xmm_src); + uni_vcvtdq2ps(vmm_src, vmm_src); break; case ov::element::i32: if (src_prc.is_real()) - uni_vcvtps2dq(xmm_src, xmm_src); + uni_vcvtps2dq(vmm_src, vmm_src); break; default: OPENVINO_THROW("unknown dst_prc"); + } + } + } + + inline void load_scalar(Xmm xmm_src, + const Xbyak::Address& op, + ov::element::Type src_prc, + ov::element::Type dst_prc) { + if (src_prc == dst_prc) { + switch (src_prc.size()) { + case 4: + uni_vmovss(xmm_src, op); + break; + case 1: + mov(reg_tmp_8, op); + movzx(reg_tmp_32, reg_tmp_8); + uni_vmovd(xmm_src, reg_tmp_32); + break; + default: + OPENVINO_THROW("unknown prc"); + } + return; + } + + switch (src_prc) { + case ov::element::f32: + case ov::element::i32: + uni_vmovss(xmm_src, op); + break; + case ov::element::bf16: + if (isa == x64::avx2_vnni_2) { + vbcstnebf162ps(xmm_src, op); + } else { + uni_vpinsrw(xmm_src, xmm_src, op, 0); + uni_vpslld(xmm_src, xmm_src, 16); + } + break; + case ov::element::f16: + if (isa == x64::avx2_vnni_2) { + vbcstnesh2ps(xmm_src, op); + } else { + vcvtph2ps(xmm_src, op); + } + break; + case ov::element::i16: + uni_vpinsrw(xmm_src, xmm_src, op, 0); + uni_vpmovsxwd(xmm_src, op); + break; + case ov::element::u16: + uni_vpinsrw(xmm_src, xmm_src, op, 0); + uni_vpmovzxwd(xmm_src, op); + break; + case ov::element::i8: + movsx(reg_tmp_32, op); + uni_vmovq(xmm_src, reg_tmp_64); + break; + case ov::element::u8: + movzx(reg_tmp_32, op); + uni_vmovq(xmm_src, reg_tmp_64); + break; + default: + OPENVINO_THROW("unknown src_prc"); + } + + switch (dst_prc) { + case ov::element::f32: + if (!src_prc.is_real()) + uni_vcvtdq2ps(xmm_src, xmm_src); + break; + case 
ov::element::i32: + if (src_prc.is_real()) + uni_vcvtps2dq(xmm_src, xmm_src); + break; + default: + OPENVINO_THROW("unknown dst_prc"); } } - inline void store_vector(const Xbyak::Address &op, Vmm vmm_dst, ov::element::Type src_prc, ov::element::Type dst_prc) { + inline void store_vector(const Xbyak::Address& op, + Vmm vmm_dst, + ov::element::Type src_prc, + ov::element::Type dst_prc) { Xmm xmm_dst = Xmm(vmm_dst.getIdx()); Ymm ymm_dst = Ymm(vmm_dst.getIdx()); @@ -874,170 +917,173 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener } switch (src_prc) { - case ov::element::f32: - if (!dst_prc.is_real()) - uni_vcvtps2dq(vmm_dst, vmm_dst); - break; - case ov::element::i32: - if (dst_prc.is_real()) - uni_vcvtdq2ps(vmm_dst, vmm_dst); - break; - default: - OPENVINO_THROW("unknown src_prc"); + case ov::element::f32: + if (!dst_prc.is_real()) + uni_vcvtps2dq(vmm_dst, vmm_dst); + break; + case ov::element::i32: + if (dst_prc.is_real()) + uni_vcvtdq2ps(vmm_dst, vmm_dst); + break; + default: + OPENVINO_THROW("unknown src_prc"); } switch (dst_prc) { - case ov::element::f32: - case ov::element::i32: - uni_vmovups(op, vmm_dst); - break; - case ov::element::bf16: - if (isa == x64::avx512_core) { - uni_vcvtneps2bf16->emit_code({static_cast(vmm_dst.getIdx())}, - {static_cast(ymm_dst.getIdx())}); - vmovdqu16(op, ymm_dst); - } else { - uni_vcvtneps2bf16->emit_code({static_cast(vmm_dst.getIdx())}, - {static_cast(xmm_dst.getIdx())}); + case ov::element::f32: + case ov::element::i32: + uni_vmovups(op, vmm_dst); + break; + case ov::element::bf16: + if (isa == x64::avx512_core) { + uni_vcvtneps2bf16->emit_code({static_cast(vmm_dst.getIdx())}, + {static_cast(ymm_dst.getIdx())}); + vmovdqu16(op, ymm_dst); + } else { + uni_vcvtneps2bf16->emit_code({static_cast(vmm_dst.getIdx())}, + {static_cast(xmm_dst.getIdx())}); + uni_vmovdqu(op, xmm_dst); + } + break; + case ov::element::f16: + vcvtps2ph(op, vmm_dst, 0x4); + break; + case ov::element::i16: + if (isa == 
x64::avx512_core) { + vpmovsdw(op, vmm_dst); + } else { + uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst); + if (isa != x64::sse41) { + vpermq(ymm_dst, ymm_dst, 0x08); uni_vmovdqu(op, xmm_dst); - } - break; - case ov::element::f16: - vcvtps2ph(op, vmm_dst, 0x4); - break; - case ov::element::i16: - if (isa == x64::avx512_core) { - vpmovsdw(op, vmm_dst); } else { - uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst); - if (isa != x64::sse41) { - vpermq(ymm_dst, ymm_dst, 0x08); - uni_vmovdqu(op, xmm_dst); - } else { - movq(op, xmm_dst); - } + movq(op, xmm_dst); } - break; - case ov::element::u16: - if (isa == x64::avx512_core) { - vpmaxsd(vmm_dst, vmm_zero, vmm_dst); - vpmovusdw(op, vmm_dst); - } else { - uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst); - if (isa != x64::sse41) { - vpermq(ymm_dst, ymm_dst, 0x08); - uni_vmovdqu(op, xmm_dst); - } else { - movq(op, xmm_dst); - } - } - break; - case ov::element::i8: - if (isa == x64::avx512_core) { - vpmovsdb(op, vmm_dst); - } else { - uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst); - if (isa != x64::sse41) - vpermq(ymm_dst, ymm_dst, 0x08); - uni_vpacksswb(vmm_dst, vmm_dst, vmm_dst); - if (isa != x64::sse41) - vmovq(op, xmm_dst); - else - movd(op, xmm_dst); - } - break; - case ov::element::u8: - if (isa == x64::avx512_core) { - vpmaxsd(vmm_dst, vmm_zero, vmm_dst); - vpmovusdb(op, vmm_dst); + } + break; + case ov::element::u16: + if (isa == x64::avx512_core) { + vpmaxsd(vmm_dst, vmm_zero, vmm_dst); + vpmovusdw(op, vmm_dst); + } else { + uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst); + if (isa != x64::sse41) { + vpermq(ymm_dst, ymm_dst, 0x08); + uni_vmovdqu(op, xmm_dst); } else { - uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst); - if (isa != x64::sse41) - vpermq(ymm_dst, ymm_dst, 0x08); - uni_vpackuswb(vmm_dst, vmm_dst, vmm_dst); - if (isa != x64::sse41) - vmovq(op, xmm_dst); - else - movd(op, xmm_dst); + movq(op, xmm_dst); } - break; - default: - OPENVINO_THROW("unknown dst_prc"); + } + break; + case ov::element::i8: + if (isa == x64::avx512_core) { + vpmovsdb(op, 
vmm_dst); + } else { + uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst); + if (isa != x64::sse41) + vpermq(ymm_dst, ymm_dst, 0x08); + uni_vpacksswb(vmm_dst, vmm_dst, vmm_dst); + if (isa != x64::sse41) + vmovq(op, xmm_dst); + else + movd(op, xmm_dst); + } + break; + case ov::element::u8: + if (isa == x64::avx512_core) { + vpmaxsd(vmm_dst, vmm_zero, vmm_dst); + vpmovusdb(op, vmm_dst); + } else { + uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst); + if (isa != x64::sse41) + vpermq(ymm_dst, ymm_dst, 0x08); + uni_vpackuswb(vmm_dst, vmm_dst, vmm_dst); + if (isa != x64::sse41) + vmovq(op, xmm_dst); + else + movd(op, xmm_dst); + } + break; + default: + OPENVINO_THROW("unknown dst_prc"); } } - inline void store_scalar(const Xbyak::Address &op, Xmm xmm_dst, ov::element::Type src_prc, ov::element::Type dst_prc) { + inline void store_scalar(const Xbyak::Address& op, + Xmm xmm_dst, + ov::element::Type src_prc, + ov::element::Type dst_prc) { if (src_prc == dst_prc) { switch (src_prc.size()) { - case 4: - uni_vmovss(op, xmm_dst); - break; - case 1: - movq(reg_tmp_64, xmm_dst); - mov(op, reg_tmp_8); - break; - default: - OPENVINO_THROW("unknown prc"); + case 4: + uni_vmovss(op, xmm_dst); + break; + case 1: + movq(reg_tmp_64, xmm_dst); + mov(op, reg_tmp_8); + break; + default: + OPENVINO_THROW("unknown prc"); } return; } switch (src_prc) { - case ov::element::f32: - if (!dst_prc.is_real()) - uni_vcvtps2dq(xmm_dst, xmm_dst); - break; - case ov::element::i32: - if (dst_prc.is_real()) - uni_vcvtdq2ps(xmm_dst, xmm_dst); - break; - default: - OPENVINO_THROW("unknown src_prc"); + case ov::element::f32: + if (!dst_prc.is_real()) + uni_vcvtps2dq(xmm_dst, xmm_dst); + break; + case ov::element::i32: + if (dst_prc.is_real()) + uni_vcvtdq2ps(xmm_dst, xmm_dst); + break; + default: + OPENVINO_THROW("unknown src_prc"); } switch (dst_prc) { - case ov::element::f32: - case ov::element::i32: - uni_vmovss(op, xmm_dst); - break; - case ov::element::bf16: - uni_vpsrld(xmm_dst, xmm_dst, 16); - uni_vpextrw(op, xmm_dst, 
0x0); - break; - case ov::element::f16: - vcvtps2ph(xmm_dst, xmm_dst, 0x4); - movq(reg_tmp_64, xmm_dst); - mov(op, reg_tmp_16); - break; - case ov::element::i16: - uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); - movq(reg_tmp_64, xmm_dst); - mov(op, reg_tmp_16); - break; - case ov::element::u16: - uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst); - movq(reg_tmp_64, xmm_dst); - mov(op, reg_tmp_16); - break; - case ov::element::i8: - uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); - uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst); - movq(reg_tmp_64, xmm_dst); - mov(op, reg_tmp_8); - break; - case ov::element::u8: - uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst); - uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst); - movq(reg_tmp_64, xmm_dst); - mov(op, reg_tmp_8); - break; - default: - OPENVINO_THROW("unknown dst_prc"); + case ov::element::f32: + case ov::element::i32: + uni_vmovss(op, xmm_dst); + break; + case ov::element::bf16: + uni_vpsrld(xmm_dst, xmm_dst, 16); + uni_vpextrw(op, xmm_dst, 0x0); + break; + case ov::element::f16: + vcvtps2ph(xmm_dst, xmm_dst, 0x4); + movq(reg_tmp_64, xmm_dst); + mov(op, reg_tmp_16); + break; + case ov::element::i16: + uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); + movq(reg_tmp_64, xmm_dst); + mov(op, reg_tmp_16); + break; + case ov::element::u16: + uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst); + movq(reg_tmp_64, xmm_dst); + mov(op, reg_tmp_16); + break; + case ov::element::i8: + uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); + uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst); + movq(reg_tmp_64, xmm_dst); + mov(op, reg_tmp_8); + break; + case ov::element::u8: + uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst); + uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst); + movq(reg_tmp_64, xmm_dst); + mov(op, reg_tmp_8); + break; + default: + OPENVINO_THROW("unknown dst_prc"); } } }; -#endif // OPENVINO_ARCH_X86_64 +#endif // OPENVINO_ARCH_X86_64 Eltwise::BroadcastingPolicy Eltwise::determineBroadcastingPolicy(const std::shared_ptr& op) { const auto const1 = ov::as_type_ptr(op->get_input_node_shared_ptr(0)); @@ -1297,7 
+1343,6 @@ const std::map& Eltwise::getIn return initializers; } - namespace { struct EltwiseKey { @@ -1353,12 +1398,8 @@ struct EltwiseKey { return false; } - bool result = eltwise_data == rhs.eltwise_data && - ops_list == rhs.ops_list && - inpPrc == rhs.inpPrc && - outPrc == rhs.outPrc && - *postOps.get() == *rhs.postOps.get() && - implType == rhs.implType; + bool result = eltwise_data == rhs.eltwise_data && ops_list == rhs.ops_list && inpPrc == rhs.inpPrc && + outPrc == rhs.outPrc && *postOps.get() == *rhs.postOps.get() && implType == rhs.implType; if (result) { if (implType == EltwiseImplType::optimizedShapeAgnostic) { @@ -1370,8 +1411,7 @@ struct EltwiseKey { return false; } } else { - result = result && outOrder == rhs.outOrder && - outBlkDims == rhs.outBlkDims; + result = result && outOrder == rhs.outOrder && outBlkDims == rhs.outBlkDims; for (size_t i = 0; i < inpDims.size() && result; ++i) { result = result && (inpDims[i] == rhs.inpDims[i]); } @@ -1426,7 +1466,8 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { auto collapseLastOffsets = [](std::vector& dims, int dimsToCollapse) { for (size_t i = dims.size() - 2; i > dims.size() - dimsToCollapse - 2; i--) { if (dims[dims.size() - 1] > 0 || dims[i] > 0) - dims[dims.size() - 1] = std::max(dims[dims.size() - 1], static_cast(1)) * std::max(dims[i], static_cast(1)); + dims[dims.size() - 1] = std::max(dims[dims.size() - 1], static_cast(1)) * + std::max(dims[i], static_cast(1)); else dims[dims.size() - 1] *= dims[i]; } @@ -1442,8 +1483,10 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { auto isFusedWith = [&](Type type_) { auto start_itr = ops_list.begin(); - std::advance(start_itr, 1); // apply offset since the first op in the list is the op itself - return any_of(start_itr, ops_list.end(), [=](Type type) { return type == type_; }); + std::advance(start_itr, 1); // apply offset since the first op in the list is the op itself + return any_of(start_itr, ops_list.end(), [=](Type type) 
{ + return type == type_; + }); }; if (inpDims.empty()) { @@ -1493,7 +1536,8 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { int oc_dim_idx = i + (jep.input_size - outOrder.size()); jep.oc_offsets[oc_dim_idx] = offset_oc; offset_oc *= jep.dims[oc_dim_idx]; - if (oc_dim_idx + 1 != static_cast(jep.input_size)) { // since in nspc case we can safely collapse the last axis + if (oc_dim_idx + 1 != + static_cast(jep.input_size)) { // since in nspc case we can safely collapse the last axis lastUnchangedAxis = oc_dim_idx; } } @@ -1514,7 +1558,8 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { int collapsedDims = 0; bool hasDifferentDims = false; - while (!useRuntimePtrs && currentJitWorkAmount < minimalJitWorkAmount && currentJitWorkAmount < fullWorkAmount) { + while (!useRuntimePtrs && currentJitWorkAmount < minimalJitWorkAmount && + currentJitWorkAmount < fullWorkAmount) { if (collapsedDims >= maxCollapsedDims) break; @@ -1595,8 +1640,9 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { jep.work_amount = jep.dst_size = jep.dims.back(); jep.oc_size = oc_size; - std::transform(jep.oc_offsets.begin(), jep.oc_offsets.end(), jep.oc_offsets.begin(), - [](size_t& offset) { return offset * sizeof(float);}); + std::transform(jep.oc_offsets.begin(), jep.oc_offsets.end(), jep.oc_offsets.begin(), [](size_t& offset) { + return offset * sizeof(float); + }); #if defined(OPENVINO_ARCH_X86_64) if (mayiuse(x64::avx512_core)) { @@ -1608,7 +1654,7 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { } else { OPENVINO_THROW("Can't create jit eltwise kernel"); } -#endif // OPENVINO_ARCH_X86_64 +#endif // OPENVINO_ARCH_X86_64 #if defined(OPENVINO_ARCH_ARM64) if (mayiuse(aarch64::asimd)) { @@ -1616,28 +1662,28 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { } else { OPENVINO_THROW("Can't create jit eltwise kernel"); } -#endif // OPENVINO_ARCH_ARM64 +#endif // OPENVINO_ARCH_ARM64 if (_pKernel) _pKernel->create_ker(); } - 
void exec(const jit_eltwise_call_args_ptrs &args_ptrs, const VectorDims &dims_out) override { + void exec(const jit_eltwise_call_args_ptrs& args_ptrs, const VectorDims& dims_out) override { if (!_pKernel) OPENVINO_THROW("Can't execute, kernel for eltwise node is not compiled"); if (_pKernel->jep_.input_size == optimalTensorRank) { // execute Optimized 6D auto d6_loop = [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { - auto args = jit_eltwise_call_args_indexes(); - args.indexes[0] = i0; - args.indexes[1] = i1; - args.indexes[2] = i2; - args.indexes[3] = i3; - args.indexes[4] = i4; + auto args = jit_eltwise_call_args_indexes(); + args.indexes[0] = i0; + args.indexes[1] = i1; + args.indexes[2] = i2; + args.indexes[3] = i3; + args.indexes[4] = i4; - (*_pKernel)(&args_ptrs, &args); - }; + (*_pKernel)(&args_ptrs, &args); + }; parallel_nt_static(m_threads_num, [&](const int ithr, const int nthr) { for_5d(ithr, nthr, dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], d6_loop); @@ -1693,13 +1739,14 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor { /* enabled only for float at float16_t at the moment * can be extended in the future */ -template +template class EltwiseRefBaseExecutor : public Eltwise::IEltwiseExecutor { public: EltwiseRefBaseExecutor(const EltwiseData& opData, const VectorDims& outBlkDims, const std::vector& inpDims) - : _opData(std::move(opData)), _inpDims(inpDims) { + : _opData(std::move(opData)), + _inpDims(inpDims) { if (inpDims.empty()) { OPENVINO_THROW("Can not make Eltwise executor from empty input dims array"); } else if (inpDims.front().empty()) { @@ -1750,18 +1797,18 @@ class EltwiseRefBaseExecutor : public Eltwise::IEltwiseExecutor { protected: void init_ptr(const jit_eltwise_call_args_ptrs& args_ptrs, - const VectorDims& dims_out, - std::vector& counters, - const size_t iwork, - std::vector& src_f, - T*& dst_ptr_f) { + const VectorDims& dims_out, + std::vector& counters, + const size_t iwork, + 
std::vector& src_f, + T*& dst_ptr_f) { size_t tmp = iwork; for (ptrdiff_t j = dims_out.size() - 1; j >= 0; j--) { counters[j] = tmp % dims_out[j]; tmp /= dims_out[j]; } - size_t index_in[MAX_ELTWISE_INPUTS] = { 0 }; + size_t index_in[MAX_ELTWISE_INPUTS] = {0}; for (size_t i = 0; i < _inputNum; i++) { index_in[i] = 0; for (size_t j = 0; j < counters.size(); j++) { @@ -1776,7 +1823,7 @@ class EltwiseRefBaseExecutor : public Eltwise::IEltwiseExecutor { } index_out /= sizeof(T); - //std::vector src_f(_inputNum); + // std::vector src_f(_inputNum); for (size_t i = 0; i < _inputNum; i++) { src_f[i] = (reinterpret_cast(args_ptrs.src_ptr[i]) + index_in[i])[0]; } @@ -1795,19 +1842,15 @@ class EltwiseRefBaseExecutor : public Eltwise::IEltwiseExecutor { /* enabled only for float at float16_t at the moment * can be extended in the future */ -template::value || - std::is_same::value> - ::type * = nullptr> +template ::value || + std::is_same::value>::type* = nullptr> class EltwiseRefExecutor : public EltwiseRefBaseExecutor { public: - EltwiseRefExecutor(const EltwiseData& opData, - const VectorDims& outBlkDims, - std::vector inpDims) : EltwiseRefBaseExecutor(opData, outBlkDims, inpDims) { - } + EltwiseRefExecutor(const EltwiseData& opData, const VectorDims& outBlkDims, std::vector inpDims) + : EltwiseRefBaseExecutor(opData, outBlkDims, inpDims) {} - void exec(const jit_eltwise_call_args_ptrs &args_ptrs, const VectorDims &dims_out) override { + void exec(const jit_eltwise_call_args_ptrs& args_ptrs, const VectorDims& dims_out) override { if (this->_opData.algo == Algorithm::EltwiseLog) { const T* src_ptr_f = reinterpret_cast(args_ptrs.src_ptr[0]); T* dst_ptr_f = reinterpret_cast(args_ptrs.dst_ptr); @@ -1857,8 +1900,11 @@ class EltwiseRefExecutor : public EltwiseRefBaseExecutor { std::shared_ptr ref_eltwise_injector = nullptr; if (this->_opData.onednnAlgorithm != dnnl::algorithm::undef) { - ref_eltwise_injector = std::make_shared( - static_cast(this->_opData.onednnAlgorithm), 
this->_opData.alpha, this->_opData.beta, 1.f); + ref_eltwise_injector = + std::make_shared(static_cast(this->_opData.onednnAlgorithm), + this->_opData.alpha, + this->_opData.beta, + 1.f); } parallel_nt(0, [&](const int ithr, const int nthr) { @@ -1873,86 +1919,144 @@ class EltwiseRefExecutor : public EltwiseRefBaseExecutor { this->init_ptr(args_ptrs, dims_out, counters, iwork, src_f, dst_ptr_f); switch (this->_opData.algo) { - case Algorithm::EltwiseRelu: - case Algorithm::EltwiseGeluErf: - case Algorithm::EltwiseGeluTanh: - case Algorithm::EltwiseElu: - case Algorithm::EltwiseTanh: - case Algorithm::EltwiseSigmoid: - case Algorithm::EltwiseAbs: - case Algorithm::EltwiseSqrt: - case Algorithm::EltwiseSoftRelu: - case Algorithm::EltwiseClamp: - case Algorithm::EltwiseSwish: - case Algorithm::EltwiseHswish: - case Algorithm::EltwiseMish: - case Algorithm::EltwiseHsigmoid: - case Algorithm::EltwiseRoundHalfToEven: - case Algorithm::EltwiseRoundHalfAwayFromZero: - *dst_ptr_f = ref_eltwise_injector->compute_scalar(src_f[0]); - break; - case Algorithm::EltwiseAdd: *dst_ptr_f = src_f[0] + src_f[1]; break; - case Algorithm::EltwiseMulAdd: *dst_ptr_f = src_f[0] * src_f[1] + src_f[2]; break; - case Algorithm::EltwiseSubtract: *dst_ptr_f = src_f[0] - src_f[1]; break; - case Algorithm::EltwiseMultiply: *dst_ptr_f = src_f[0] * src_f[1]; break; - case Algorithm::EltwiseDivide: *dst_ptr_f = src_f[0] / src_f[1]; break; - case Algorithm::EltwiseCeiling: *dst_ptr_f = ceilf(src_f[0]); break; - case Algorithm::EltwiseFloor: *dst_ptr_f = floorf(src_f[0]); break; - case Algorithm::EltwiseFloorMod: *dst_ptr_f = src_f[0] - floorf(src_f[0] / src_f[1]) * src_f[1]; break; - case Algorithm::EltwiseMod: *dst_ptr_f = src_f[0] - truncf(src_f[0] / src_f[1]) * src_f[1]; break; - case Algorithm::EltwiseMaximum: *dst_ptr_f = std::max(src_f[0], src_f[1]); break; - case Algorithm::EltwiseMinimum: *dst_ptr_f = std::min(src_f[0], src_f[1]); break; - case Algorithm::EltwiseExp: *dst_ptr_f = 
expf(src_f[0]); break; - case Algorithm::EltwiseSquaredDifference: *dst_ptr_f = powf((src_f[0] - src_f[1]), 2.f); break; - case Algorithm::EltwisePowerDynamic: *dst_ptr_f = powf(src_f[0], src_f[1]); break; - case Algorithm::EltwiseEqual: *dst_ptr_f = src_f[0] == src_f[1]; break; - case Algorithm::EltwiseNotEqual: *dst_ptr_f = src_f[0] != src_f[1]; break; - case Algorithm::EltwiseGreater: *dst_ptr_f = src_f[0] > src_f[1]; break; - case Algorithm::EltwiseGreaterEqual: *dst_ptr_f = src_f[0] >= src_f[1]; break; - case Algorithm::EltwiseLess: *dst_ptr_f = src_f[0] < src_f[1]; break; - case Algorithm::EltwiseLessEqual: *dst_ptr_f = src_f[0] <= src_f[1]; break; - case Algorithm::EltwiseLogicalAnd: *dst_ptr_f = src_f[0] && src_f[1]; break; - case Algorithm::EltwiseLogicalOr: *dst_ptr_f = src_f[0] || src_f[1]; break; - case Algorithm::EltwiseLogicalXor: *dst_ptr_f = (src_f[0] || src_f[1]) - (src_f[0] && src_f[1]); break; - case Algorithm::EltwiseLogicalNot: *dst_ptr_f = !src_f[0]; break; - case Algorithm::EltwisePrelu: *dst_ptr_f = src_f[0] > 0 ? src_f[0] : static_cast(src_f[0] * src_f[1]); break; - case Algorithm::EltwiseErf: *dst_ptr_f = std::erf(src_f[0]); break; - case Algorithm::EltwiseSoftSign: *dst_ptr_f = src_f[0] / (1 + std::fabs(src_f[0])); break; - // @todo implement proper isinfinite for non-float precisions - case Algorithm::EltwiseIsFinite: *dst_ptr_f = std::isfinite(static_cast(src_f[0])); break; - case Algorithm::EltwiseIsInf: - *dst_ptr_f = (this->_opData.alpha && (src_f[0] == -std::numeric_limits::infinity())) || - (this->_opData.beta && (src_f[0] == std::numeric_limits::infinity())); - break; - case Algorithm::EltwiseIsNaN: *dst_ptr_f = std::isnan(src_f[0]); break; - case Algorithm::EltwiseSelect: *dst_ptr_f = src_f[0] ? 
src_f[1] : src_f[2]; break; - default: OPENVINO_THROW("Unsupported operation type for Eltwise executor"); + case Algorithm::EltwiseRelu: + case Algorithm::EltwiseGeluErf: + case Algorithm::EltwiseGeluTanh: + case Algorithm::EltwiseElu: + case Algorithm::EltwiseTanh: + case Algorithm::EltwiseSigmoid: + case Algorithm::EltwiseAbs: + case Algorithm::EltwiseSqrt: + case Algorithm::EltwiseSoftRelu: + case Algorithm::EltwiseClamp: + case Algorithm::EltwiseSwish: + case Algorithm::EltwiseHswish: + case Algorithm::EltwiseMish: + case Algorithm::EltwiseHsigmoid: + case Algorithm::EltwiseRoundHalfToEven: + case Algorithm::EltwiseRoundHalfAwayFromZero: + *dst_ptr_f = ref_eltwise_injector->compute_scalar(src_f[0]); + break; + case Algorithm::EltwiseAdd: + *dst_ptr_f = src_f[0] + src_f[1]; + break; + case Algorithm::EltwiseMulAdd: + *dst_ptr_f = src_f[0] * src_f[1] + src_f[2]; + break; + case Algorithm::EltwiseSubtract: + *dst_ptr_f = src_f[0] - src_f[1]; + break; + case Algorithm::EltwiseMultiply: + *dst_ptr_f = src_f[0] * src_f[1]; + break; + case Algorithm::EltwiseDivide: + *dst_ptr_f = src_f[0] / src_f[1]; + break; + case Algorithm::EltwiseCeiling: + *dst_ptr_f = ceilf(src_f[0]); + break; + case Algorithm::EltwiseFloor: + *dst_ptr_f = floorf(src_f[0]); + break; + case Algorithm::EltwiseFloorMod: + *dst_ptr_f = src_f[0] - floorf(src_f[0] / src_f[1]) * src_f[1]; + break; + case Algorithm::EltwiseMod: + *dst_ptr_f = src_f[0] - truncf(src_f[0] / src_f[1]) * src_f[1]; + break; + case Algorithm::EltwiseMaximum: + *dst_ptr_f = std::max(src_f[0], src_f[1]); + break; + case Algorithm::EltwiseMinimum: + *dst_ptr_f = std::min(src_f[0], src_f[1]); + break; + case Algorithm::EltwiseExp: + *dst_ptr_f = expf(src_f[0]); + break; + case Algorithm::EltwiseSquaredDifference: + *dst_ptr_f = powf((src_f[0] - src_f[1]), 2.f); + break; + case Algorithm::EltwisePowerDynamic: + *dst_ptr_f = powf(src_f[0], src_f[1]); + break; + case Algorithm::EltwiseEqual: + *dst_ptr_f = src_f[0] == src_f[1]; + 
break; + case Algorithm::EltwiseNotEqual: + *dst_ptr_f = src_f[0] != src_f[1]; + break; + case Algorithm::EltwiseGreater: + *dst_ptr_f = src_f[0] > src_f[1]; + break; + case Algorithm::EltwiseGreaterEqual: + *dst_ptr_f = src_f[0] >= src_f[1]; + break; + case Algorithm::EltwiseLess: + *dst_ptr_f = src_f[0] < src_f[1]; + break; + case Algorithm::EltwiseLessEqual: + *dst_ptr_f = src_f[0] <= src_f[1]; + break; + case Algorithm::EltwiseLogicalAnd: + *dst_ptr_f = src_f[0] && src_f[1]; + break; + case Algorithm::EltwiseLogicalOr: + *dst_ptr_f = src_f[0] || src_f[1]; + break; + case Algorithm::EltwiseLogicalXor: + *dst_ptr_f = (src_f[0] || src_f[1]) - (src_f[0] && src_f[1]); + break; + case Algorithm::EltwiseLogicalNot: + *dst_ptr_f = !src_f[0]; + break; + case Algorithm::EltwisePrelu: + *dst_ptr_f = src_f[0] > 0 ? src_f[0] : static_cast(src_f[0] * src_f[1]); + break; + case Algorithm::EltwiseErf: + *dst_ptr_f = std::erf(src_f[0]); + break; + case Algorithm::EltwiseSoftSign: + *dst_ptr_f = src_f[0] / (1 + std::fabs(src_f[0])); + break; + // @todo implement proper isinfinite for non-float precisions + case Algorithm::EltwiseIsFinite: + *dst_ptr_f = std::isfinite(static_cast(src_f[0])); + break; + case Algorithm::EltwiseIsInf: + *dst_ptr_f = (this->_opData.alpha && (src_f[0] == -std::numeric_limits::infinity())) || + (this->_opData.beta && (src_f[0] == std::numeric_limits::infinity())); + break; + case Algorithm::EltwiseIsNaN: + *dst_ptr_f = std::isnan(src_f[0]); + break; + case Algorithm::EltwiseSelect: + *dst_ptr_f = src_f[0] ? 
src_f[1] : src_f[2]; + break; + default: + OPENVINO_THROW("Unsupported operation type for Eltwise executor"); } } }); } }; -template::value || - std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value> - ::type * = nullptr> +template ::value || std::is_same::value || + std::is_same::value || std::is_same::value || + std::is_same::value>::type* = nullptr> class BitwiseRefExecutor : public EltwiseRefBaseExecutor { public: - BitwiseRefExecutor(const EltwiseData& opData, - const VectorDims& outBlkDims, - const std::vector& inpDims) : EltwiseRefBaseExecutor(opData, outBlkDims, inpDims) { - } + BitwiseRefExecutor(const EltwiseData& opData, const VectorDims& outBlkDims, const std::vector& inpDims) + : EltwiseRefBaseExecutor(opData, outBlkDims, inpDims) {} - void exec(const jit_eltwise_call_args_ptrs &args_ptrs, const VectorDims &dims_out) override { + void exec(const jit_eltwise_call_args_ptrs& args_ptrs, const VectorDims& dims_out) override { std::shared_ptr ref_eltwise_injector = nullptr; if (this->_opData.onednnAlgorithm != dnnl::algorithm::undef) { - ref_eltwise_injector = std::make_shared( - static_cast(this->_opData.onednnAlgorithm), this->_opData.alpha, this->_opData.beta, 1.f); + ref_eltwise_injector = + std::make_shared(static_cast(this->_opData.onednnAlgorithm), + this->_opData.alpha, + this->_opData.beta, + 1.f); } parallel_nt(0, [&](const int ithr, const int nthr) { @@ -1967,81 +2071,79 @@ class BitwiseRefExecutor : public EltwiseRefBaseExecutor { this->init_ptr(args_ptrs, dims_out, counters, iwork, src_f, dst_ptr_f); switch (this->_opData.algo) { - case Algorithm::EltwiseBitwiseAnd: { - *dst_ptr_f = src_f[0] & src_f[1]; - break; - } - case Algorithm::EltwiseBitwiseNot: { - *dst_ptr_f = ~src_f[0]; - break; - } - case Algorithm::EltwiseBitwiseOr: { - *dst_ptr_f = src_f[0] | src_f[1]; - break; - } - case Algorithm::EltwiseBitwiseXor: { - *dst_ptr_f = src_f[0] ^ src_f[1]; - break; - } - case 
Algorithm::EltwiseBitwiseLeftShift: { - *dst_ptr_f = src_f[0] << src_f[1]; - break; - } - case Algorithm::EltwiseBitwiseRightShift: { - *dst_ptr_f = src_f[0] >> src_f[1]; - break; - } - default: - OPENVINO_THROW("Unsupported operation type for Eltwise executor"); + case Algorithm::EltwiseBitwiseAnd: { + *dst_ptr_f = src_f[0] & src_f[1]; + break; + } + case Algorithm::EltwiseBitwiseNot: { + *dst_ptr_f = ~src_f[0]; + break; + } + case Algorithm::EltwiseBitwiseOr: { + *dst_ptr_f = src_f[0] | src_f[1]; + break; + } + case Algorithm::EltwiseBitwiseXor: { + *dst_ptr_f = src_f[0] ^ src_f[1]; + break; + } + case Algorithm::EltwiseBitwiseLeftShift: { + *dst_ptr_f = src_f[0] << src_f[1]; + break; + } + case Algorithm::EltwiseBitwiseRightShift: { + *dst_ptr_f = src_f[0] >> src_f[1]; + break; + } + default: + OPENVINO_THROW("Unsupported operation type for Eltwise executor"); } } }); } }; -} // namespace +} // namespace static Eltwise::executorPtr buildRefExecutor(const EltwiseKey& key) { switch (key.outPrc) { - case ov::element::f16: - return std::make_shared>(key.eltwise_data.front(), - key.outBlkDims, - key.inpDims); - case ov::element::i8: - return std::make_shared::value_type>>( - key.eltwise_data.front(), - key.outBlkDims, - key.inpDims); - - case ov::element::u8: - return std::make_shared::value_type>>( - key.eltwise_data.front(), - key.outBlkDims, - key.inpDims); - - case ov::element::i16: - return std::make_shared::value_type>>( - key.eltwise_data.front(), - key.outBlkDims, - key.inpDims); - - case ov::element::u16: - return std::make_shared::value_type>>( - key.eltwise_data.front(), - key.outBlkDims, - key.inpDims); + case ov::element::f16: + return std::make_shared>(key.eltwise_data.front(), + key.outBlkDims, + key.inpDims); + case ov::element::i8: + return std::make_shared::value_type>>( + key.eltwise_data.front(), + key.outBlkDims, + key.inpDims); + + case ov::element::u8: + return std::make_shared::value_type>>( + key.eltwise_data.front(), + key.outBlkDims, + 
key.inpDims); + + case ov::element::i16: + return std::make_shared::value_type>>( + key.eltwise_data.front(), + key.outBlkDims, + key.inpDims); + + case ov::element::u16: + return std::make_shared::value_type>>( + key.eltwise_data.front(), + key.outBlkDims, + key.inpDims); # - case ov::element::i32: - return std::make_shared::value_type>>( - key.eltwise_data.front(), - key.outBlkDims, - key.inpDims); + case ov::element::i32: + return std::make_shared::value_type>>( + key.eltwise_data.front(), + key.outBlkDims, + key.inpDims); - default: - // use float reference executor for any other precision for now - return std::make_shared>(key.eltwise_data.front(), - key.outBlkDims, - key.inpDims); + default: + // use float reference executor for any other precision for now + return std::make_shared>(key.eltwise_data.front(), key.outBlkDims, key.inpDims); } } @@ -2064,7 +2166,7 @@ static Eltwise::executorPtr buildExecutor(const EltwiseKey& key) { bool Eltwise::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { if (getInitializers().find(op->get_type_info()) == getInitializers().end()) { - errorMessage = "Doesn't support Eltwise algorithm: " + std::string(op->get_type_name()); + errorMessage = "Doesn't support Eltwise algorithm: " + std::string(op->get_type_name()); return false; } if (const auto binOp = ov::as_type_ptr(op)) { @@ -2087,8 +2189,9 @@ bool Eltwise::isSupportedOperation(const std::shared_ptr& op, st return true; } -Eltwise::Eltwise(const std::shared_ptr& op, const GraphContext::CPtr context) : - Node(op, context, EltwiseShapeInferFactory()), broadcastingPolicy(Undefined) { +Eltwise::Eltwise(const std::shared_ptr& op, const GraphContext::CPtr context) + : Node(op, context, EltwiseShapeInferFactory()), + broadcastingPolicy(Undefined) { std::string errorMessage; if (!isSupportedOperation(op, errorMessage)) { OPENVINO_THROW_NOT_IMPLEMENTED(errorMessage); @@ -2098,67 +2201,68 @@ Eltwise::Eltwise(const std::shared_ptr& op, const 
GraphContext::CPtr c size_t Eltwise::getOpInputsNum() const { switch (getAlgorithm()) { - case Algorithm::EltwiseIsFinite: - case Algorithm::EltwiseIsInf: - case Algorithm::EltwiseIsNaN: - case Algorithm::EltwiseRelu: - case Algorithm::EltwiseGeluErf: - case Algorithm::EltwiseGeluTanh: - case Algorithm::EltwiseCeiling: - case Algorithm::EltwiseFloor: - case Algorithm::EltwiseElu: - case Algorithm::EltwiseTanh: - case Algorithm::EltwiseSigmoid: - case Algorithm::EltwiseAbs: - case Algorithm::EltwiseSqrt: - case Algorithm::EltwiseSoftRelu: - case Algorithm::EltwiseExp: - case Algorithm::EltwiseClamp: - case Algorithm::EltwiseErf: - case Algorithm::EltwiseLogicalNot: - case Algorithm::EltwisePowerStatic: - case Algorithm::EltwiseSwish: - case Algorithm::EltwiseHswish: - case Algorithm::EltwiseMish: - case Algorithm::EltwiseHsigmoid: - case Algorithm::EltwiseRoundHalfToEven: - case Algorithm::EltwiseRoundHalfAwayFromZero: - case Algorithm::EltwiseSoftSign: - case Algorithm::EltwiseLog: - return 1; - case Algorithm::EltwiseAdd: - case Algorithm::EltwiseSubtract: - case Algorithm::EltwiseMultiply: - case Algorithm::EltwiseDivide: - case Algorithm::EltwiseFloorMod: - case Algorithm::EltwiseMod: - case Algorithm::EltwiseMaximum: - case Algorithm::EltwiseMinimum: - case Algorithm::EltwiseSquaredDifference: - case Algorithm::EltwisePowerDynamic: - case Algorithm::EltwiseEqual: - case Algorithm::EltwiseNotEqual: - case Algorithm::EltwiseGreater: - case Algorithm::EltwiseGreaterEqual: - case Algorithm::EltwiseLess: - case Algorithm::EltwiseLessEqual: - case Algorithm::EltwiseLogicalAnd: - case Algorithm::EltwiseLogicalOr: - case Algorithm::EltwiseLogicalXor: - case Algorithm::EltwiseBitwiseAnd: - case Algorithm::EltwiseBitwiseOr: - case Algorithm::EltwiseBitwiseXor: - case Algorithm::EltwiseBitwiseLeftShift: - case Algorithm::EltwiseBitwiseRightShift: - return 2; - case Algorithm::EltwiseBitwiseNot: - return 1; - case Algorithm::EltwisePrelu: - return 2; - case 
Algorithm::EltwiseMulAdd: - case Algorithm::EltwiseSelect: - return 3; - default: OPENVINO_THROW("Unsupported operation for Eltwise node with name `", getName(), "`."); + case Algorithm::EltwiseIsFinite: + case Algorithm::EltwiseIsInf: + case Algorithm::EltwiseIsNaN: + case Algorithm::EltwiseRelu: + case Algorithm::EltwiseGeluErf: + case Algorithm::EltwiseGeluTanh: + case Algorithm::EltwiseCeiling: + case Algorithm::EltwiseFloor: + case Algorithm::EltwiseElu: + case Algorithm::EltwiseTanh: + case Algorithm::EltwiseSigmoid: + case Algorithm::EltwiseAbs: + case Algorithm::EltwiseSqrt: + case Algorithm::EltwiseSoftRelu: + case Algorithm::EltwiseExp: + case Algorithm::EltwiseClamp: + case Algorithm::EltwiseErf: + case Algorithm::EltwiseLogicalNot: + case Algorithm::EltwisePowerStatic: + case Algorithm::EltwiseSwish: + case Algorithm::EltwiseHswish: + case Algorithm::EltwiseMish: + case Algorithm::EltwiseHsigmoid: + case Algorithm::EltwiseRoundHalfToEven: + case Algorithm::EltwiseRoundHalfAwayFromZero: + case Algorithm::EltwiseSoftSign: + case Algorithm::EltwiseLog: + return 1; + case Algorithm::EltwiseAdd: + case Algorithm::EltwiseSubtract: + case Algorithm::EltwiseMultiply: + case Algorithm::EltwiseDivide: + case Algorithm::EltwiseFloorMod: + case Algorithm::EltwiseMod: + case Algorithm::EltwiseMaximum: + case Algorithm::EltwiseMinimum: + case Algorithm::EltwiseSquaredDifference: + case Algorithm::EltwisePowerDynamic: + case Algorithm::EltwiseEqual: + case Algorithm::EltwiseNotEqual: + case Algorithm::EltwiseGreater: + case Algorithm::EltwiseGreaterEqual: + case Algorithm::EltwiseLess: + case Algorithm::EltwiseLessEqual: + case Algorithm::EltwiseLogicalAnd: + case Algorithm::EltwiseLogicalOr: + case Algorithm::EltwiseLogicalXor: + case Algorithm::EltwiseBitwiseAnd: + case Algorithm::EltwiseBitwiseOr: + case Algorithm::EltwiseBitwiseXor: + case Algorithm::EltwiseBitwiseLeftShift: + case Algorithm::EltwiseBitwiseRightShift: + return 2; + case 
Algorithm::EltwiseBitwiseNot: + return 1; + case Algorithm::EltwisePrelu: + return 2; + case Algorithm::EltwiseMulAdd: + case Algorithm::EltwiseSelect: + return 3; + default: + OPENVINO_THROW("Unsupported operation for Eltwise node with name `", getName(), "`."); } } @@ -2183,40 +2287,37 @@ void Eltwise::getSupportedDescriptors() { void Eltwise::initSupportedPrimitiveDescriptors() { const auto isBitwise = [](const Algorithm& algorithm) { - return one_of( - algorithm, - Algorithm::EltwiseBitwiseAnd, - Algorithm::EltwiseBitwiseNot, - Algorithm::EltwiseBitwiseOr, - Algorithm::EltwiseBitwiseXor, - Algorithm::EltwiseBitwiseLeftShift, - Algorithm::EltwiseBitwiseRightShift); + return one_of(algorithm, + Algorithm::EltwiseBitwiseAnd, + Algorithm::EltwiseBitwiseNot, + Algorithm::EltwiseBitwiseOr, + Algorithm::EltwiseBitwiseXor, + Algorithm::EltwiseBitwiseLeftShift, + Algorithm::EltwiseBitwiseRightShift); }; - std::vector supportedPrecisions = isBitwise(algorithm) ? - std::vector { - ov::element::u8, - ov::element::i8, - ov::element::u16, - ov::element::i16, - ov::element::i32 - } : std::vector { - ov::element::f32, - ov::element::u8, - ov::element::i8, - ov::element::u16, - ov::element::i16, - ov::element::bf16, - ov::element::f16, - ov::element::i32 - }; + std::vector supportedPrecisions = isBitwise(algorithm) + ? 
std::vector{ov::element::u8, + ov::element::i8, + ov::element::u16, + ov::element::i16, + ov::element::i32} + : std::vector{ov::element::f32, + ov::element::u8, + ov::element::i8, + ov::element::u16, + ov::element::i16, + ov::element::bf16, + ov::element::f16, + ov::element::i32}; if (!supportedPrimitiveDescriptors.empty()) return; - // if dim rank is greater than the maximum possible, we should use the reference execution -#if defined (OPENVINO_ARCH_ARM64) - bool canUseOptimizedImpl = mayiuse(dnnl::impl::cpu::aarch64::asimd) && (getInputShapeAtPort(0).getRank() <= MAX_ELTWISE_DIM_RANK); + // if dim rank is greater than the maximum possible, we should use the reference execution +#if defined(OPENVINO_ARCH_ARM64) + bool canUseOptimizedImpl = + mayiuse(dnnl::impl::cpu::aarch64::asimd) && (getInputShapeAtPort(0).getRank() <= MAX_ELTWISE_DIM_RANK); bool canUseOptimizedShapeAgnosticImpl = isDynamicNode() && canUseOptimizedImpl; #else bool canUseOptimizedImpl = mayiuse(x64::sse41) && getInputShapeAtPort(0).getRank() <= MAX_ELTWISE_DIM_RANK; @@ -2261,7 +2362,7 @@ void Eltwise::initSupportedPrimitiveDescriptors() { ")"); std::vector inputPrecisions; - for (const auto &prec : getOriginalInputPrecisions()) { + for (const auto& prec : getOriginalInputPrecisions()) { inputPrecisions.push_back(prec); } @@ -2288,31 +2389,32 @@ void Eltwise::initSupportedPrimitiveDescriptors() { } #ifndef OPENVINO_ARCH_ARM64 - implType = canUseOptimizedShapeAgnosticImpl ? EltwiseImplType::optimizedShapeAgnostic : - canUseOptimizedImpl ? EltwiseImplType::optimized : EltwiseImplType::reference; + implType = canUseOptimizedShapeAgnosticImpl ? EltwiseImplType::optimizedShapeAgnostic + : canUseOptimizedImpl ? 
EltwiseImplType::optimized + : EltwiseImplType::reference; if (!hasHardwareSupport(ov::element::bf16)) { bool hasBF16 = false; - for (auto &inPrc : inputPrecisions) + for (auto& inPrc : inputPrecisions) if (inPrc == ov::element::bf16) hasBF16 = true; if (outputPrecision == ov::element::bf16 || hasBF16) OPENVINO_THROW("Eltwise node with name `", getName(), "` doesn't support BF16 precision on this target."); } -#if defined(OV_CPU_WITH_ACL) +# if defined(OV_CPU_WITH_ACL) const bool useJit = false; -#endif +# endif #elif defined(OPENVINO_ARCH_ARM64) - const bool useJit = canUseOptimizedImpl && - jitIsSupported(this, getAlpha(), getBeta(), getGamma()); + const bool useJit = canUseOptimizedImpl && jitIsSupported(this, getAlpha(), getBeta(), getGamma()); if (!useJit) { canUseOptimizedImpl = false; } - implType = (useJit && canUseOptimizedImpl) ? - (canUseOptimizedShapeAgnosticImpl ? EltwiseImplType::optimizedShapeAgnostic : EltwiseImplType::optimized) : - EltwiseImplType::reference; + implType = + (useJit && canUseOptimizedImpl) + ? (canUseOptimizedShapeAgnosticImpl ? EltwiseImplType::optimizedShapeAgnostic : EltwiseImplType::optimized) + : EltwiseImplType::reference; #else OPENVINO_THROW("Unknow CPU architecture"); #endif @@ -2330,66 +2432,74 @@ void Eltwise::initSupportedPrimitiveDescriptors() { const bool useAcl = !useJit; if (useAcl) { - // Use original output precision as a reference point since some eltwise algorithms have non-float inputs (i.e. EltwiseSelect) - ov::element::Type forcedPrec = getOriginalOutputPrecisionAtPort(0) == ov::element::f16 ? 
ov::element::f16 : ov::element::f32; - // ACL implementation supports only identical precisions on inputs/outputs so they are aligned it to highest one - if (AclEltwiseExecutor::isEltwiseAlgorithmSupported(getAlgorithm())) { - for (size_t i = 0; i < getParentEdges().size(); i++) { - if (!getParentEdgeAt(i)->getParent()->isConstant()) { - if (getOriginalInputPrecisionAtPort(i).size() > forcedPrec.size()) { - forcedPrec = getOriginalInputPrecisionAtPort(i); + // Use original output precision as a reference point since some eltwise algorithms have non-float inputs (i.e. + // EltwiseSelect) + ov::element::Type forcedPrec = + getOriginalOutputPrecisionAtPort(0) == ov::element::f16 ? ov::element::f16 : ov::element::f32; + // ACL implementation supports only identical precisions on inputs/outputs so they are aligned it to highest one + if (AclEltwiseExecutor::isEltwiseAlgorithmSupported(getAlgorithm())) { + for (size_t i = 0; i < getParentEdges().size(); i++) { + if (!getParentEdgeAt(i)->getParent()->isConstant()) { + if (getOriginalInputPrecisionAtPort(i).size() > forcedPrec.size()) { + forcedPrec = getOriginalInputPrecisionAtPort(i); + } } } + if (!forcedPrec.is_real()) { + forcedPrec = ov::element::f32; + } } - if (!forcedPrec.is_real()) { - forcedPrec = ov::element::f32; - } - } - for (size_t i = 0; i < inputPrecisions.size(); i++) { - inputPrecisions[i] = filterPrecision(inputPrecisions[i], forcedPrec); - } - outputPrecision = filterPrecision(outputPrecision, forcedPrec); - } else { -#endif -#if defined(OV_CPU_WITH_SHL) - if (ShlEltwiseExecutor::isEltwiseAlgorithmSupported(getAlgorithm())) { - // SHL implementation supports only identical precisions on inputs/outputs and only FP32 for now - const ov::element::Type forcedPrec = ov::element::f32; for (size_t i = 0; i < inputPrecisions.size(); i++) { - inputPrecisions[i] = forcedPrec; + inputPrecisions[i] = filterPrecision(inputPrecisions[i], forcedPrec); } - outputPrecision = forcedPrec; + outputPrecision = 
filterPrecision(outputPrecision, forcedPrec); } else { #endif - auto filterPrecision = [&](const ov::element::Type& prc) { - if (implType == EltwiseImplType::reference) { - if (isBitwise(algorithm)) { - if (std::find(supportedPrecisions.begin(), supportedPrecisions.end(), prc) == supportedPrecisions.end()) { - OPENVINO_THROW("Eltwise node with name `", getName(), "` doesn't support ", prc, " precision."); - } - return prc; - } - return ov::element::f32; - } else if (std::find(supportedPrecisions.begin(), supportedPrecisions.end(), prc) == supportedPrecisions.end()) { - if (prc == ov::element::u32 || prc == ov::element::i64 || prc == ov::element::u64) { - return ov::element::i32; - } else if (prc == ov::element::f64) { - return ov::element::f32; - } else { - OPENVINO_THROW("Eltwise node with name `", getName(), "` doesn't support ", prc, " precision."); +#if defined(OV_CPU_WITH_SHL) + if (ShlEltwiseExecutor::isEltwiseAlgorithmSupported(getAlgorithm())) { + // SHL implementation supports only identical precisions on inputs/outputs and only FP32 for now + const ov::element::Type forcedPrec = ov::element::f32; + for (size_t i = 0; i < inputPrecisions.size(); i++) { + inputPrecisions[i] = forcedPrec; } + outputPrecision = forcedPrec; } else { - return prc; - } - }; +#endif + auto filterPrecision = [&](const ov::element::Type& prc) { + if (implType == EltwiseImplType::reference) { + if (isBitwise(algorithm)) { + if (std::find(supportedPrecisions.begin(), supportedPrecisions.end(), prc) == + supportedPrecisions.end()) { + OPENVINO_THROW("Eltwise node with name `", + getName(), + "` doesn't support ", + prc, + " precision."); + } + return prc; + } + return ov::element::f32; + } else if (std::find(supportedPrecisions.begin(), supportedPrecisions.end(), prc) == + supportedPrecisions.end()) { + if (prc == ov::element::u32 || prc == ov::element::i64 || prc == ov::element::u64) { + return ov::element::i32; + } else if (prc == ov::element::f64) { + return ov::element::f32; + } 
else { + OPENVINO_THROW("Eltwise node with name `", getName(), "` doesn't support ", prc, " precision."); + } + } else { + return prc; + } + }; - for (size_t i = 0; i < inputPrecisions.size(); i++) { - inputPrecisions[i] = filterPrecision(inputPrecisions[i]); - } - outputPrecision = filterPrecision(outputPrecision); + for (size_t i = 0; i < inputPrecisions.size(); i++) { + inputPrecisions[i] = filterPrecision(inputPrecisions[i]); + } + outputPrecision = filterPrecision(outputPrecision); #if defined(OV_CPU_WITH_SHL) - } + } #endif #if defined(OV_CPU_WITH_ACL) } @@ -2398,22 +2508,19 @@ void Eltwise::initSupportedPrimitiveDescriptors() { // TODO: delete after new LPT (ngraph based) is merged // WA is needed to handle bug in LPT that produces wrong precision after average pooling (I8/U8 instead of FP32) if ((getAlgorithm() == Algorithm::EltwiseMulAdd || getAlgorithm() == Algorithm::EltwisePowerStatic) && - (inputPrecisions[0] == ov::element::u8 || inputPrecisions[0] == ov::element::i8)) { + (inputPrecisions[0] == ov::element::u8 || inputPrecisions[0] == ov::element::i8)) { auto parentNode = getParentEdgeAt(0)->getParent(); if (getParentEdgeAt(0)->getParent()->getAlgorithm() == Algorithm::PoolingAvg) { inputPrecisions[0] = ov::element::f32; } } - enum LayoutType { - Planar, - ChannelsFirst, - Blocked - }; + enum LayoutType { Planar, ChannelsFirst, Blocked }; - auto initDesc = [&] (LayoutType lt, const bool useEltwiseExecutor = false, const bool useJit = false) -> NodeDesc { - auto createMemoryDesc = [lt](const Shape &shape, ov::element::Type prc, size_t offset) -> std::shared_ptr { - const auto &dims = shape.getDims(); + auto initDesc = [&](LayoutType lt, const bool useEltwiseExecutor = false, const bool useJit = false) -> NodeDesc { + auto createMemoryDesc = + [lt](const Shape& shape, ov::element::Type prc, size_t offset) -> std::shared_ptr { + const auto& dims = shape.getDims(); if (lt == ChannelsFirst && shape.getRank() != 1) { auto ndims = shape.getRank(); 
VectorDims order(ndims); @@ -2429,10 +2536,11 @@ void Eltwise::initSupportedPrimitiveDescriptors() { } return std::make_shared(prc, shape, blocks, order, offset); - // TODO: need investigate - // bad accuracy for shape {1, 1, 4, 11}, {2, 5, 1, 1} - // same for disabled collapse dims - } else if (lt == Blocked && shape.getRank() != 1 && (shape.getMinDims()[1] != Shape::UNDEFINED_DIM && shape.getMinDims()[1] > 1)) { + // TODO: need investigate + // bad accuracy for shape {1, 1, 4, 11}, {2, 5, 1, 1} + // same for disabled collapse dims + } else if (lt == Blocked && shape.getRank() != 1 && + (shape.getMinDims()[1] != Shape::UNDEFINED_DIM && shape.getMinDims()[1] > 1)) { size_t blockSize = dnnl::impl::cpu::x64::mayiuse(x64::avx512_core) ? 16 : 8; VectorDims blocks = dims; VectorDims order(blocks.size()); @@ -2463,9 +2571,9 @@ void Eltwise::initSupportedPrimitiveDescriptors() { portConfig.inPlace((!i && canBeInPlace() && inputPrecisions[i] == outputPrecision) ? 0 : -1); portConfig.constant(false); - const auto &srcShape = getInputShapeAtPort(i); + const auto& srcShape = getInputShapeAtPort(i); if (!isDynamicNode() && srcShape.getDims()[0] == 1) { - inputMask.reset(0); // accepts any stride on the batch axis + inputMask.reset(0); // accepts any stride on the batch axis } portConfig.setMemDesc(createMemoryDesc(srcShape, inputPrecisions[i], offset), inputMask); @@ -2476,10 +2584,10 @@ void Eltwise::initSupportedPrimitiveDescriptors() { portConfig.inPlace(-1); portConfig.constant(false); - const auto &dstShape = getOutputShapeAtPort(0); + const auto& dstShape = getOutputShapeAtPort(0); BlockedMemoryDesc::CmpMask outputMask = BlockedMemoryDesc::SKIP_OFFSET_MASK; if (!isDynamicNode() && dstShape.getDims()[0] == 1) { - outputMask.reset(0); // accepts any stride on the batch axis + outputMask.reset(0); // accepts any stride on the batch axis } portConfig.setMemDesc(createMemoryDesc(dstShape, outputPrecision, offset), outputMask); @@ -2487,13 +2595,13 @@ void 
Eltwise::initSupportedPrimitiveDescriptors() { if (useEltwiseExecutor || useJit) { impl_desc_type impl_type; - #if defined (OPENVINO_ARCH_ARM64) +#if defined(OPENVINO_ARCH_ARM64) if (useJit) { impl_type = impl_desc_type::jit_asimd; } - #else +#else impl_type = impl_desc_type::undef; - #endif +#endif std::vector srcMemoryDescs; for (size_t i = 0; i < config.inConfs.size(); i++) { @@ -2504,20 +2612,23 @@ void Eltwise::initSupportedPrimitiveDescriptors() { dstMemoryDescs.push_back(config.outConfs[i].getMemDesc()); } - auto factory = std::make_shared(eltwiseAttrs, srcMemoryDescs, dstMemoryDescs, - std::make_shared(context, getImplPriority())); + auto factory = + std::make_shared(eltwiseAttrs, + srcMemoryDescs, + dstMemoryDescs, + std::make_shared(context, getImplPriority())); return {config, impl_type, !factory->isEmpty() ? factory : nullptr}; } else { impl_desc_type impl_type = impl_desc_type::ref; if (canUseOptimizedImpl) { - #if defined (OPENVINO_ARCH_ARM64) +#if defined(OPENVINO_ARCH_ARM64) if (mayiuse(dnnl::impl::cpu::aarch64::asimd)) { impl_type = impl_desc_type::jit_asimd; } else { OPENVINO_THROW("not supported architecture"); } - #else +#else if (mayiuse(x64::avx512_core)) { impl_type = impl_desc_type::jit_avx512; } else if (mayiuse(x64::avx2)) { @@ -2525,7 +2636,7 @@ void Eltwise::initSupportedPrimitiveDescriptors() { } else if (mayiuse(x64::sse41)) { impl_type = impl_desc_type::jit_sse42; } - #endif +#endif } return {config, impl_type}; @@ -2534,10 +2645,11 @@ void Eltwise::initSupportedPrimitiveDescriptors() { bool isChannelsFirstApplicable = one_of(getOutputShapeAtPort(0).getRank(), 1u, 2u, 3u, 4u, 5u); for (size_t i = 0; i < getParentEdges().size(); i++) { - isChannelsFirstApplicable = isChannelsFirstApplicable && one_of(getInputShapeAtPort(i).getRank(), 1u, 2u, 3u, 4u, 5u); - isChannelsFirstApplicable = isChannelsFirstApplicable && implication(getInputShapeAtPort(i).getRank() != 1, - getOutputShapeAtPort(0).getRank() == - 
getInputShapeAtPort(i).getRank()); + isChannelsFirstApplicable = + isChannelsFirstApplicable && one_of(getInputShapeAtPort(i).getRank(), 1u, 2u, 3u, 4u, 5u); + isChannelsFirstApplicable = isChannelsFirstApplicable && + implication(getInputShapeAtPort(i).getRank() != 1, + getOutputShapeAtPort(0).getRank() == getInputShapeAtPort(i).getRank()); } #if defined(OPENVINO_ARCH_ARM64) @@ -2547,13 +2659,14 @@ void Eltwise::initSupportedPrimitiveDescriptors() { #endif for (size_t i = 0; i < getParentEdges().size(); i++) { - const auto &inShape = getInputShapeAtPort(i); + const auto& inShape = getInputShapeAtPort(i); isBlockedApplicable = isBlockedApplicable && one_of(inShape.getRank(), 1u, 3u, 4u, 5u); - isBlockedApplicable = isBlockedApplicable && implication(inShape.getRank() != 1, - getOutputShapeAtPort(0).getRank() == - inShape.getRank()); + isBlockedApplicable = + isBlockedApplicable && + implication(inShape.getRank() != 1, getOutputShapeAtPort(0).getRank() == inShape.getRank()); if (isDynamicNode() && inShape.getRank() != 1) - isBlockedApplicable = isBlockedApplicable && inShape.getMinDims()[1] != Shape::UNDEFINED_DIM && inShape.getMinDims()[1] > 1; + isBlockedApplicable = + isBlockedApplicable && inShape.getMinDims()[1] != Shape::UNDEFINED_DIM && inShape.getMinDims()[1] > 1; } inputNum = getParentEdges().size(); @@ -2561,28 +2674,29 @@ void Eltwise::initSupportedPrimitiveDescriptors() { #if defined(OV_CPU_WITH_ACL) if (useAcl || useJit) { - eltwiseAttrs = {algorithm, alpha, beta, gamma}; + eltwiseAttrs = {algorithm, alpha, beta, gamma}; - auto addDesc = [&initDesc, &useJit](std::vector& supportedPrimitiveDescriptors, const LayoutType layoutType) { - auto nodeDesc = initDesc(layoutType, !useJit, useJit); - if (nodeDesc.getExecutorFactory()) - supportedPrimitiveDescriptors.emplace_back(nodeDesc); - }; + auto addDesc = [&initDesc, &useJit](std::vector& supportedPrimitiveDescriptors, + const LayoutType layoutType) { + auto nodeDesc = initDesc(layoutType, !useJit, useJit); 
+ if (nodeDesc.getExecutorFactory()) + supportedPrimitiveDescriptors.emplace_back(nodeDesc); + }; - // @todo should be handled in scope of selectPreferPrimitiveDescriptor - if (context->getConfig().modelType == Config::ModelType::CNN) { - if (isChannelsFirstApplicable) - addDesc(supportedPrimitiveDescriptors, ChannelsFirst); - addDesc(supportedPrimitiveDescriptors, Planar); - } else { - addDesc(supportedPrimitiveDescriptors, Planar); - if (isChannelsFirstApplicable) - addDesc(supportedPrimitiveDescriptors, ChannelsFirst); - } + // @todo should be handled in scope of selectPreferPrimitiveDescriptor + if (context->getConfig().modelType == Config::ModelType::CNN) { + if (isChannelsFirstApplicable) + addDesc(supportedPrimitiveDescriptors, ChannelsFirst); + addDesc(supportedPrimitiveDescriptors, Planar); + } else { + addDesc(supportedPrimitiveDescriptors, Planar); + if (isChannelsFirstApplicable) + addDesc(supportedPrimitiveDescriptors, ChannelsFirst); + } - canUseEltwiseExecPtr = !supportedPrimitiveDescriptors.empty() && !useJit; - if (!supportedPrimitiveDescriptors.empty()) - return; + canUseEltwiseExecPtr = !supportedPrimitiveDescriptors.empty() && !useJit; + if (!supportedPrimitiveDescriptors.empty()) + return; } #endif @@ -2652,15 +2766,18 @@ void Eltwise::prepareParams() { dstMemoryDescs.push_back(getDstMemoryAtPort(0)->getDescPtr()); auto selectedPD = getSelectedPrimitiveDescriptor(); - eltwiseExecPtr = selectedPD->getExecutorFactoryAs()->makeExecutor(eltwiseAttrs, srcMemoryDescs, dstMemoryDescs, {}); + eltwiseExecPtr = selectedPD->getExecutorFactoryAs()->makeExecutor(eltwiseAttrs, + srcMemoryDescs, + dstMemoryDescs, + {}); selectedPD->setImplementationType(eltwiseExecPtr->getImplType()); return; } auto outBlockingDesc = getChildEdgeAt(0)->getMemory().getDescWithType(); - const auto &outOrder = outBlockingDesc->getOrder(); - const auto ¤tOutBlkDims = outBlockingDesc->getBlockDims(); + const auto& outOrder = outBlockingDesc->getOrder(); + const auto& 
currentOutBlkDims = outBlockingDesc->getBlockDims(); size_t input_size = std::max(static_cast(EltwiseJitExecutor::optimalTensorRank), currentOutBlkDims.size()); @@ -2679,13 +2796,16 @@ void Eltwise::prepareParams() { size_t inRank = currentInBlkDims[i].size(); // WA to normalize blocked and planar layouts - const auto &inOrder = inBlockingDesc->getOrder(); + const auto& inOrder = inBlockingDesc->getOrder(); size_t startOff = outOrder.size() != outBlockingDesc->getShape().getRank() && - outOrder[outOrder.size() - 1] != inOrder[inOrder.size() - 1] ? 1 : 0; + outOrder[outOrder.size() - 1] != inOrder[inOrder.size() - 1] + ? 1 + : 0; // WA to handle nspc layout with 1D tensors if (1 == inRank) { - if (outRank > 2 && 1 == outOrder.back()) startOff = 1; + if (outRank > 2 && 1 == outOrder.back()) + startOff = 1; } for (size_t j = 0; j < inRank; j++) { @@ -2718,14 +2838,18 @@ void Eltwise::prepareParams() { if (!canSkipSearchInCache) { EltwiseData thisOp{getAlgorithm(), getOneDnnAlgorithm(), getAlpha(), getBeta(), getGamma()}; - EltwiseKey key = {{thisOp}, {getType()}, currentOutBlkDims, outOrder, dims_in, inpPrc, outPrc, dnnl::post_ops(), implType}; + EltwiseKey key = + {{thisOp}, {getType()}, currentOutBlkDims, outOrder, dims_in, inpPrc, outPrc, dnnl::post_ops(), implType}; fqDataPtrs.clear(); - for (const auto &node : fusedWith) { + for (const auto& node : fusedWith) { key.ops_list.push_back(node->getType()); if (node->getType() == Type::Eltwise) { if (auto eltwise = std::dynamic_pointer_cast(node)) { - key.eltwise_data.push_back({eltwise->getAlgorithm(), eltwise->getOneDnnAlgorithm(), eltwise->getAlpha(), - eltwise->getBeta(), eltwise->getGamma()}); + key.eltwise_data.push_back({eltwise->getAlgorithm(), + eltwise->getOneDnnAlgorithm(), + eltwise->getAlpha(), + eltwise->getBeta(), + eltwise->getGamma()}); } } else if (node->getType() == Type::FakeQuantize) { node->appendPostOps(key.postOps, {}, fqDataPtrs); @@ -2745,9 +2869,9 @@ void Eltwise::prepareParams() { // update 
execParams for shape agnostic kernel if (implType == EltwiseImplType::optimizedShapeAgnostic) { - auto &outDims = execParams.outDims; - auto &inOffsets = execParams.inOffsets; - auto &outOffsets = execParams.outOffsets; + auto& outDims = execParams.outDims; + auto& inOffsets = execParams.inOffsets; + auto& outOffsets = execParams.outOffsets; // outDims recalculation outDims.resize(dims_in[0].size(), 1); @@ -2805,7 +2929,8 @@ void Eltwise::selectOptimalPrimitiveDescriptor() { void Eltwise::execute(dnnl::stream strm) { if (execPtr) { jit_eltwise_call_args_ptrs args_ptrs = {}; - VectorDims dims_out = implType == EltwiseImplType::optimizedShapeAgnostic ? execParams.outDims : execPtr->getOutDims(); + VectorDims dims_out = + implType == EltwiseImplType::optimizedShapeAgnostic ? execParams.outDims : execPtr->getOutDims(); for (size_t i = 0; i < memPtrs.size() - 1; i++) args_ptrs.src_ptr[i] = memPtrs[i]->getDataAs() + start_offset_in[i]; args_ptrs.dst_ptr = memPtrs.back()->getDataAs() + start_offset_out; @@ -2873,15 +2998,14 @@ void Eltwise::fuseInto(NodePtr& parentNode) { getAlgorithm() == Algorithm::EltwiseAdd && dimsEqualWeak(getInputShapeAtPort(0).getDims(), getInputShapeAtPort(1).getDims()) && !getParentEdgeAt(0)->getParent()->isConstant() && !getParentEdgeAt(1)->getParent()->isConstant(); - if ((scales.empty() && shifts.empty()) && - !specialConvolutionAddFusing && + if ((scales.empty() && shifts.empty()) && !specialConvolutionAddFusing && canBePerformedAsScaleShift(parentNode.get())) { std::tie(scales, shifts) = getScalesAndShifts(parentNode.get()); } Node::fuseInto(parentNode); } -void Eltwise::appendMemory(const std::vector &data, MemoryPtr &memPtr, std::vector& postOpsMem) { +void Eltwise::appendMemory(const std::vector& data, MemoryPtr& memPtr, std::vector& postOpsMem) { if (!memPtr) { DnnlBlockedMemoryDesc memoryDesc(ov::element::f32, {data.size()}); memPtr = std::make_shared(getEngine(), memoryDesc, data.data()); @@ -2889,12 +3013,15 @@ void 
Eltwise::appendMemory(const std::vector &data, MemoryPtr &memPtr, st } } -void Eltwise::appendMemory(const std::vector &data, MemoryPtr &memPtr, std::vector& postOpsMem) { +void Eltwise::appendMemory(const std::vector& data, MemoryPtr& memPtr, std::vector& postOpsMem) { postOpsMem.push_back(data.data()); } template -void Eltwise::appendPostOpsImpl(dnnl::post_ops& ops, const VectorDims &postOpDims, std::vector& postOpsMem, const int channelAxis) { +void Eltwise::appendPostOpsImpl(dnnl::post_ops& ops, + const VectorDims& postOpDims, + std::vector& postOpsMem, + const int channelAxis) { const std::string errorPrefix = "Appending Eltwise node with name '" + getName() + "' "; if (getOneDnnAlgorithm() != dnnl::algorithm::undef) { @@ -2920,7 +3047,8 @@ void Eltwise::appendPostOpsImpl(dnnl::post_ops& ops, const VectorDims &postOpDim case dnnl::algorithm::eltwise_round_half_away_from_zero: ops.append_eltwise(getOneDnnAlgorithm(), getAlpha(), getBeta()); break; - default: OPENVINO_THROW(errorPrefix, "as post operation is not supported"); + default: + OPENVINO_THROW(errorPrefix, "as post operation is not supported"); } } else { // per-tensor EltwisePowerStatic can be implemented with more well-supported eltwise postOps @@ -2938,7 +3066,8 @@ void Eltwise::appendPostOpsImpl(dnnl::post_ops& ops, const VectorDims &postOpDim const auto chIdx = postOpDims.size() > 1 ? 
channelAxis : 0; channelSize = postOpDims[chIdx]; } - // since legacy depthwise post ops mechanism requires broadcasted data we need to reinitilize it in case of changed shape + // since legacy depthwise post ops mechanism requires broadcasted data we need to reinitilize it in case of + // changed shape if (depthwiseData.empty() || depthwiseDataSize != 2 * channelSize) { depthwiseData.clear(); depthwiseMemory.reset(); @@ -2995,7 +3124,10 @@ void Eltwise::appendPostOpsImpl(dnnl::post_ops& ops, const VectorDims &postOpDim } } -void Eltwise::appendPostOps(dnnl::post_ops& ops, const VectorDims &postOpDims, std::unordered_map& postOpsMem, const int channelAxis) { +void Eltwise::appendPostOps(dnnl::post_ops& ops, + const VectorDims& postOpDims, + std::unordered_map& postOpsMem, + const int channelAxis) { std::vector postOpsMemPtrs; appendPostOpsImpl(ops, postOpDims, postOpsMemPtrs, channelAxis); @@ -3006,11 +3138,17 @@ void Eltwise::appendPostOps(dnnl::post_ops& ops, const VectorDims &postOpDims, s } } -void Eltwise::appendPostOps(dnnl::post_ops& ops, const VectorDims &postOpDims, std::vector& postOpsMem, const int channelAxis) { +void Eltwise::appendPostOps(dnnl::post_ops& ops, + const VectorDims& postOpDims, + std::vector& postOpsMem, + const int channelAxis) { appendPostOpsImpl(ops, postOpDims, postOpsMem, channelAxis); } -bool Eltwise::appendAttrPostOps(DnnlPostOpsComposerLegacy& dnnlpoc, bool isLastPostOp, dnnl::memory::data_type outDataType, bool allowBinary) { +bool Eltwise::appendAttrPostOps(DnnlPostOpsComposerLegacy& dnnlpoc, + bool isLastPostOp, + dnnl::memory::data_type outDataType, + bool allowBinary) { const std::string errorPrefix = "Appending Eltwise node with name '" + getName() + "' as binary post op "; if (getOneDnnAlgorithm() != dnnl::algorithm::undef) { @@ -3039,7 +3177,8 @@ bool Eltwise::appendAttrPostOps(DnnlPostOpsComposerLegacy& dnnlpoc, bool isLastP // call dnnlpoc's specialized API to generate optimized postOps sequence 
dnnlpoc.appendLinear({getAlpha()}, {getBeta()}, isLastPostOp); break; - default: OPENVINO_THROW(errorPrefix, "as post operation is not supported"); + default: + OPENVINO_THROW(errorPrefix, "as post operation is not supported"); } } else { switch (getAlgorithm()) { @@ -3054,9 +3193,9 @@ bool Eltwise::appendAttrPostOps(DnnlPostOpsComposerLegacy& dnnlpoc, bool isLastP case Algorithm::EltwisePowerStatic: if (beta != 1.0f && gamma != 0.0f) { return dnnlpoc.appendLinear(scales, shifts, isLastPostOp, allowBinary); - } else if (beta != 1.0f) {// Multiply if has scales + } else if (beta != 1.0f) { // Multiply if has scales return dnnlpoc.appendScale(scales, isLastPostOp, allowBinary); - } else if (gamma != 0.0f) {// Add only if has shifts + } else if (gamma != 0.0f) { // Add only if has shifts return dnnlpoc.appendShift(shifts, allowBinary); } break; @@ -3103,16 +3242,17 @@ bool Eltwise::canFuseParent(const NodePtr& parentNode) const { bool Eltwise::canFuse(const NodePtr& node) const { auto isIntegerComputeSupported = [](const Node* node) { - if (!one_of(node->getAlgorithm(), Algorithm::EltwiseAdd, - Algorithm::EltwiseMultiply, - Algorithm::EltwiseMulAdd, - Algorithm::EltwiseSubtract, - Algorithm::EltwiseDivide, - Algorithm::EltwiseSquaredDifference)) { + if (!one_of(node->getAlgorithm(), + Algorithm::EltwiseAdd, + Algorithm::EltwiseMultiply, + Algorithm::EltwiseMulAdd, + Algorithm::EltwiseSubtract, + Algorithm::EltwiseDivide, + Algorithm::EltwiseSquaredDifference)) { return false; } - for (const auto &originalInputPrecision : node->getOriginalInputPrecisions()) { + for (const auto& originalInputPrecision : node->getOriginalInputPrecisions()) { if (originalInputPrecision != ov::element::i32) { return false; } @@ -3121,7 +3261,7 @@ bool Eltwise::canFuse(const NodePtr& node) const { return true; }; -#if defined (OPENVINO_ARCH_ARM64) +#if defined(OPENVINO_ARCH_ARM64) if (!mayiuse(dnnl::impl::cpu::aarch64::asimd) || (getInputShapeAtPort(0).getRank() > MAX_ELTWISE_DIM_RANK)) 
return false; @@ -3129,10 +3269,8 @@ bool Eltwise::canFuse(const NodePtr& node) const { return false; } const auto eltwise = dynamic_cast(node.get()); - if ((eltwise == nullptr) || (!jitIsSupported(eltwise, - eltwise->getAlpha(), - eltwise->getBeta(), - eltwise->getGamma()))) { + if ((eltwise == nullptr) || + (!jitIsSupported(eltwise, eltwise->getAlpha(), eltwise->getBeta(), eltwise->getGamma()))) { return false; } #else @@ -3170,29 +3308,30 @@ bool Eltwise::canFuse(const NodePtr& node) const { return false; if (node->getType() == Type::Eltwise) { - // [WA] Since execution precision change from I32 to FP32 for arithmetic operations may lead to incorrect results - // we disable fusing cases which may lead to invalid precision conversions inside the kernel - // [TODO] We need to rewrite support for different precisions at all to avoid implicit conversions to FP32 - // (all should be handled via explicit convert operations) + // [WA] Since execution precision change from I32 to FP32 for arithmetic operations may lead to incorrect + // results we disable fusing cases which may lead to invalid precision conversions inside the kernel [TODO] We + // need to rewrite support for different precisions at all to avoid implicit conversions to FP32 (all should be + // handled via explicit convert operations) bool isIntegerFusingNode = isIntegerComputeSupported(node.get()); - if ((isIntegerNode && !isIntegerFusingNode) || - (!isIntegerNode && isIntegerFusingNode)) { + if ((isIntegerNode && !isIntegerFusingNode) || (!isIntegerNode && isIntegerFusingNode)) { return false; } if (node->getParentEdgeAt(0)->getParent().get() != this) { - // Eltwise jitter doesn't respect commutative property, so fusing is disabled in case it applied not for 0-th port. 
- if (one_of(node->getAlgorithm(), Algorithm::EltwiseSubtract, - Algorithm::EltwiseDivide, - Algorithm::EltwiseFloorMod, - Algorithm::EltwiseMod, - Algorithm::EltwisePowerDynamic, - Algorithm::EltwiseGreater, - Algorithm::EltwiseGreaterEqual, - Algorithm::EltwiseLess, - Algorithm::EltwiseLessEqual, - Algorithm::EltwiseMulAdd, - Algorithm::EltwiseSelect)) { + // Eltwise jitter doesn't respect commutative property, so fusing is disabled in case it applied not for + // 0-th port. + if (one_of(node->getAlgorithm(), + Algorithm::EltwiseSubtract, + Algorithm::EltwiseDivide, + Algorithm::EltwiseFloorMod, + Algorithm::EltwiseMod, + Algorithm::EltwisePowerDynamic, + Algorithm::EltwiseGreater, + Algorithm::EltwiseGreaterEqual, + Algorithm::EltwiseLess, + Algorithm::EltwiseLessEqual, + Algorithm::EltwiseMulAdd, + Algorithm::EltwiseSelect)) { return false; } @@ -3205,7 +3344,8 @@ bool Eltwise::canFuse(const NodePtr& node) const { } } - // We can use optimized execution with fusions only in cases when dim rank is less or equal to the maximum possible + // We can use optimized execution with fusions only in cases when dim rank is less or equal to the maximum + // possible if (node->getInputShapeAtPort(0).getRank() > MAX_ELTWISE_DIM_RANK) return false; @@ -3224,13 +3364,15 @@ ov::element::Type Eltwise::getRuntimePrecision() const { // Don't take bias precision into account for (size_t i = 0; i < getParentEdges().size(); i++) { auto parentEdge = getParentEdgeAt(i); - if (parentEdge && parentEdge->getStatus() == Edge::Status::Validated && !parentEdge->getParent()->isConstant()) { - inputPrecisions.emplace_back(DnnlExtensionUtils::DataTypeToElementType((parentEdge->getMemoryPtr()->getDataType()))); + if (parentEdge && parentEdge->getStatus() == Edge::Status::Validated && + !parentEdge->getParent()->isConstant()) { + inputPrecisions.emplace_back( + DnnlExtensionUtils::DataTypeToElementType((parentEdge->getMemoryPtr()->getDataType()))); } } return getMaxPrecision(inputPrecisions); } 
-} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/eltwise.h b/src/plugins/intel_cpu/src/nodes/eltwise.h index 6013ce732ee5fc..d0ca94e08824c8 100644 --- a/src/plugins/intel_cpu/src/nodes/eltwise.h +++ b/src/plugins/intel_cpu/src/nodes/eltwise.h @@ -5,17 +5,18 @@ #pragma once #include + +#include #include #include -#include #include "dnnl_postops_composer_legacy.h" -#include "nodes/executors/eltwise.hpp" #include "executors/eltwise_list.hpp" +#include "nodes/executors/eltwise.hpp" #include "nodes/kernels/jit_eltwise_call_args_ptrs.hpp" #if defined(OPENVINO_ARCH_ARM64) -#include "kernels/aarch64/jit_uni_eltwise_generic.hpp" +# include "kernels/aarch64/jit_uni_eltwise_generic.hpp" #endif namespace ov { @@ -68,18 +69,14 @@ struct jit_uni_eltwise_kernel { #endif -enum class EltwiseImplType { - reference = 0, - optimized = 1, - optimizedShapeAgnostic = 2 -}; +enum class EltwiseImplType { reference = 0, optimized = 1, optimizedShapeAgnostic = 2 }; class Eltwise : public Node { public: class IEltwiseExecutor { public: IEltwiseExecutor() = default; - virtual void exec(const jit_eltwise_call_args_ptrs &args_ptrs, const VectorDims &dims_out) = 0; + virtual void exec(const jit_eltwise_call_args_ptrs& args_ptrs, const VectorDims& dims_out) = 0; virtual size_t getBatchDimIdx() const = 0; virtual const VectorDims& getOutDims() const = 0; virtual ~IEltwiseExecutor() = default; @@ -98,22 +95,45 @@ class Eltwise : public Node { bool canBeInPlace() const override; bool canFuseParent(const NodePtr& parentNode) const; bool canFuse(const NodePtr& node) const override; - void appendPostOps(dnnl::post_ops& ops, const VectorDims &postOpDims, std::unordered_map& postOpsMem, const int channelAxis = 1) override; - void appendPostOps(dnnl::post_ops& ops, const VectorDims &postOpDims, std::vector& postOpsMem, const int channelAxis = 1) override; - bool 
appendAttrPostOps(DnnlPostOpsComposerLegacy& dnnlpoc, bool isLastPostOp, dnnl::memory::data_type outDataType, bool allowBinary = true); + void appendPostOps(dnnl::post_ops& ops, + const VectorDims& postOpDims, + std::unordered_map& postOpsMem, + const int channelAxis = 1) override; + void appendPostOps(dnnl::post_ops& ops, + const VectorDims& postOpDims, + std::vector& postOpsMem, + const int channelAxis = 1) override; + bool appendAttrPostOps(DnnlPostOpsComposerLegacy& dnnlpoc, + bool isLastPostOp, + dnnl::memory::data_type outDataType, + bool allowBinary = true); void fuseInto(NodePtr& parentNode) override; ov::element::Type getRuntimePrecision() const override; - float getAlpha() const { return alpha; } - float getBeta() const { return beta; } - float getGamma() const { return gamma; } - const std::vector& getScales() const { return scales; } - const std::vector& getShifts() const { return shifts; } + float getAlpha() const { + return alpha; + } + float getBeta() const { + return beta; + } + float getGamma() const { + return gamma; + } + const std::vector& getScales() const { + return scales; + } + const std::vector& getShifts() const { + return shifts; + } - dnnl::algorithm getOneDnnAlgorithm() const { return onednnAlgorithm; } + dnnl::algorithm getOneDnnAlgorithm() const { + return onednnAlgorithm; + } bool isWithBroadcast(); - bool isSpecialConvolutionAddFusing() const { return specialConvolutionAddFusing; } + bool isSpecialConvolutionAddFusing() const { + return specialConvolutionAddFusing; + } bool needPrepareParams() const override; void prepareParams() override; @@ -127,7 +147,9 @@ class Eltwise : public Node { Undefined, }; - BroadcastingPolicy getBroadcastingPolicy() const { return broadcastingPolicy; } + BroadcastingPolicy getBroadcastingPolicy() const { + return broadcastingPolicy; + } static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; @@ -181,10 +203,13 @@ class Eltwise : public Node { size_t 
getOpInputsNum() const; template - void appendPostOpsImpl(dnnl::post_ops& ops, const VectorDims &postOpDims, std::vector& postOpsMem, const int channelAxis = 1); + void appendPostOpsImpl(dnnl::post_ops& ops, + const VectorDims& postOpDims, + std::vector& postOpsMem, + const int channelAxis = 1); - void appendMemory(const std::vector &data, MemoryPtr &memPtr, std::vector& postOpsMem); - void appendMemory(const std::vector &data, MemoryPtr &memPtr, std::vector& postOpsMem); + void appendMemory(const std::vector& data, MemoryPtr& memPtr, std::vector& postOpsMem); + void appendMemory(const std::vector& data, MemoryPtr& memPtr, std::vector& postOpsMem); bool canUseEltwiseExecPtr = false; EltwiseAttrs eltwiseAttrs; @@ -201,6 +226,6 @@ class eltwise_precision_helper { static std::set> get_supported_precisions(const Algorithm& algo); }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/embedding_bag.cpp b/src/plugins/intel_cpu/src/nodes/embedding_bag.cpp index 8b144e90c865bc..2dcb93f9fc6c1b 100644 --- a/src/plugins/intel_cpu/src/nodes/embedding_bag.cpp +++ b/src/plugins/intel_cpu/src/nodes/embedding_bag.cpp @@ -18,10 +18,10 @@ namespace intel_cpu { namespace node { EmbeddingBag::EmbeddingBag(const std::shared_ptr& op, - size_t requiredInputNum, - size_t indicesIdx, - size_t perSampleWeightsIdx, - size_t defaultIndexIdx) + size_t requiredInputNum, + size_t indicesIdx, + size_t perSampleWeightsIdx, + size_t defaultIndexIdx) : INDICES_IDX(indicesIdx), PER_SAMPLE_WEIGHTS_IDX(perSampleWeightsIdx), DEFAULT_INDEX_IDX(defaultIndexIdx) { @@ -47,9 +47,9 @@ void EmbeddingBag::prepareParams(const VectorDims& indexStaticShape) { template void EmbeddingBag::processData(const T* srcData, - const T* weightsData, - const VectorDims& inDataDims, - const MemoryPtr& outMemory) { + const T* weightsData, + const VectorDims& inDataDims, + const MemoryPtr& outMemory) { 
std::string msgPrefix = std::string("Node EmbeddingBag with name '") + _layerName + "' "; initFromInputs(); @@ -127,10 +127,10 @@ void EmbeddingBag::processData(const T* srcData, } void EmbeddingBag::execute(const uint8_t* srcData, - const uint8_t* weightsData, - const ov::element::Type& srcPrc, - const VectorDims& inDims, - const MemoryPtr& outMemory) { + const uint8_t* weightsData, + const ov::element::Type& srcPrc, + const VectorDims& inDims, + const MemoryPtr& outMemory) { switch (srcPrc) { case ov::element::f32: { return processData::value_type>( @@ -157,8 +157,7 @@ void EmbeddingBag::execute(const uint8_t* srcData, outMemory); } default: { - OPENVINO_THROW("EmbeddingBag layer does not support precision '" + std::string(srcPrc.get_type_name()) + - "'"); + OPENVINO_THROW("EmbeddingBag layer does not support precision '" + std::string(srcPrc.get_type_name()) + "'"); } } } diff --git a/src/plugins/intel_cpu/src/nodes/embedding_bag.h b/src/plugins/intel_cpu/src/nodes/embedding_bag.h index 28c8666233fa1a..d804ea06c2b317 100644 --- a/src/plugins/intel_cpu/src/nodes/embedding_bag.h +++ b/src/plugins/intel_cpu/src/nodes/embedding_bag.h @@ -13,32 +13,32 @@ namespace node { class EmbeddingBag { public: enum class Reduction { SUM, MEAN }; - EmbeddingBag( - const std::shared_ptr&, - size_t requiredInputsNum, - size_t indicesIdx, - size_t perSampleWeightsIdx, - size_t defaultIndexIdx); - - void execute(const uint8_t* srcData, const uint8_t* weightsData, const ov::element::Type &srcPrc, - const VectorDims& inDims, const MemoryPtr& outMemory); + EmbeddingBag(const std::shared_ptr&, + size_t requiredInputsNum, + size_t indicesIdx, + size_t perSampleWeightsIdx, + size_t defaultIndexIdx); + + void execute(const uint8_t* srcData, + const uint8_t* weightsData, + const ov::element::Type& srcPrc, + const VectorDims& inDims, + const MemoryPtr& outMemory); ~EmbeddingBag() = default; protected: virtual void initFromInputs() = 0; - virtual void getIndices( - size_t embIndex, - const 
int*& indicesRef, - size_t& size, - int& weightsIdx, - bool& withWeights) = 0; + virtual void getIndices(size_t embIndex, + const int*& indicesRef, + size_t& size, + int& weightsIdx, + bool& withWeights) = 0; void prepareParams(const VectorDims& indexStaticShape); - template - void processData(const T* srcData, const T* weightsData, - const VectorDims& inDataDims, const MemoryPtr& outMemory); + template + void processData(const T* srcData, const T* weightsData, const VectorDims& inDataDims, const MemoryPtr& outMemory); const size_t EMB_TABLE_IDX = 0lu; const size_t INDICES_IDX; @@ -51,6 +51,6 @@ class EmbeddingBag { std::string _layerName; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.cpp b/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.cpp index b5fbaee982808d..8da557a823a948 100644 --- a/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.cpp +++ b/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.cpp @@ -2,24 +2,27 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "embedding_bag_offsets.h" + #include -#include #include -#include "embedding_bag_offsets.h" -#include "openvino/op/embeddingbag_offsets_sum.hpp" -#include "openvino/op/embeddingbag_offsets.hpp" +#include +#include "openvino/op/embeddingbag_offsets.hpp" +#include "openvino/op/embeddingbag_offsets_sum.hpp" namespace ov { namespace intel_cpu { namespace node { -bool EmbeddingBagOffset::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool EmbeddingBagOffset::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage) noexcept { try { const auto embBagOffsetSumOp = ov::as_type_ptr(op); const auto embBagOffsetOp = ov::as_type_ptr(op); if (!embBagOffsetSumOp && !embBagOffsetOp) { - errorMessage = "Node is not an instance of the v3::EmbeddingBagOffsetsSum or v15::EmbeddingBagOffsets 
operation."; + errorMessage = + "Node is not an instance of the v3::EmbeddingBagOffsetsSum or v15::EmbeddingBagOffsets operation."; return false; } } catch (...) { @@ -46,7 +49,8 @@ EmbeddingBagOffset::EmbeddingBagOffset(const std::shared_ptr& op, cons _reduction = Reduction::MEAN; break; default: - THROW_CPU_NODE_ERR("EmbeddingBagOffsets does not support reduction mode: ", ov::as_string(offsets_op->get_reduction())); + THROW_CPU_NODE_ERR("EmbeddingBagOffsets does not support reduction mode: ", + ov::as_string(offsets_op->get_reduction())); } } if (getInputShapeAtPort(INDICES_IDX).getRank() != 1ul) @@ -61,8 +65,10 @@ void EmbeddingBagOffset::initSupportedPrimitiveDescriptors() { return; std::string logPrefix = std::string("Layer EmbeddingBag with name '") + _layerName + "' "; - static const std::set supportedPrecisions = - {ov::element::f32, ov::element::i8, ov::element::u8, ov::element::i32}; + static const std::set supportedPrecisions = {ov::element::f32, + ov::element::i8, + ov::element::u8, + ov::element::i32}; auto inDataPrecision = getOriginalInputPrecisionAtPort(EMB_TABLE_IDX); if (one_of(inDataPrecision, ov::element::bf16, ov::element::f16)) @@ -71,8 +77,10 @@ void EmbeddingBagOffset::initSupportedPrimitiveDescriptors() { if (supportedPrecisions.find(inDataPrecision) == supportedPrecisions.end()) OPENVINO_THROW(logPrefix, "has unsupported precision: ", inDataPrecision.get_type_name()); } else { - static const std::set defaultSupportedPrecisions = - {ov::element::f32, ov::element::i8, ov::element::u8, ov::element::i32}; + static const std::set defaultSupportedPrecisions = {ov::element::f32, + ov::element::i8, + ov::element::u8, + ov::element::i32}; if (defaultSupportedPrecisions.find(inDataPrecision) == defaultSupportedPrecisions.end()) OPENVINO_THROW(logPrefix, "has unsupported precision: ", inDataPrecision.get_type_name()); } @@ -103,7 +111,11 @@ void EmbeddingBagOffset::initFromInputs() { } } -void EmbeddingBagOffset::getIndices(size_t embIndex, const 
int*& indices, size_t& size, int& weightsIdx, bool& withWeight) { +void EmbeddingBagOffset::getIndices(size_t embIndex, + const int*& indices, + size_t& size, + int& weightsIdx, + bool& withWeight) { if (static_cast(embIndex) >= _offsetsLen) { OPENVINO_THROW("Invalid embedding bag index."); } @@ -145,20 +157,23 @@ bool EmbeddingBagOffset::isExecutable() const { } void EmbeddingBagOffset::execute(dnnl::stream strm) { - const auto *srcData = getSrcDataAtPortAs(0); + const auto* srcData = getSrcDataAtPortAs(0); const uint8_t* weightsData = nullptr; if (_withWeights) weightsData = getSrcDataAtPortAs(PER_SAMPLE_WEIGHTS_IDX); - const auto &inputMem = getParentEdgeAt(0)->getMemory(); - EmbeddingBag::execute(srcData, weightsData, inputMem.getDesc().getPrecision(), - inputMem.getStaticDims(), getDstMemoryAtPort(0)); + const auto& inputMem = getParentEdgeAt(0)->getMemory(); + EmbeddingBag::execute(srcData, + weightsData, + inputMem.getDesc().getPrecision(), + inputMem.getStaticDims(), + getDstMemoryAtPort(0)); } bool EmbeddingBagOffset::created() const { return getType() == Type::EmbeddingBagOffsets; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.h b/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.h index a31b518e7891a9..f8a28152a26642 100644 --- a/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.h +++ b/src/plugins/intel_cpu/src/nodes/embedding_bag_offsets.h @@ -15,7 +15,7 @@ class EmbeddingBagOffset : public Node, public EmbeddingBag { public: EmbeddingBagOffset(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; @@ -41,6 +41,6 @@ class EmbeddingBagOffset : public Node, public EmbeddingBag 
{ size_t _offsetsLen = 0; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.cpp b/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.cpp index fd2e0b6141f1fc..c1a06835a67af3 100644 --- a/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.cpp +++ b/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.cpp @@ -2,23 +2,27 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "embedding_bag_packed.h" + #include -#include #include -#include "embedding_bag_packed.h" -#include "openvino/op/embeddingbag_packedsum.hpp" +#include + #include "openvino/op/embeddingbag_packed.hpp" +#include "openvino/op/embeddingbag_packedsum.hpp" namespace ov { namespace intel_cpu { namespace node { -bool EmbeddingBagPacked::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool EmbeddingBagPacked::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage) noexcept { try { const auto embBagPackedSumOp = ov::as_type_ptr(op); const auto embBagPackedOp = ov::as_type_ptr(op); if (!embBagPackedSumOp && !embBagPackedOp) { - errorMessage = "Node is not an instance of the v3::EmbeddingBagPackedSum or v15::EmbeddingBagPacked operations."; + errorMessage = + "Node is not an instance of the v3::EmbeddingBagPackedSum or v15::EmbeddingBagPacked operations."; return false; } } catch (...) 
{ @@ -45,7 +49,8 @@ EmbeddingBagPacked::EmbeddingBagPacked(const std::shared_ptr& op, cons _reduction = Reduction::MEAN; break; default: - THROW_CPU_NODE_ERR("EmbeddingBagPacked does not support reduction mode: ", ov::as_string(packed_op->get_reduction())); + THROW_CPU_NODE_ERR("EmbeddingBagPacked does not support reduction mode: ", + ov::as_string(packed_op->get_reduction())); } } if (getInputShapeAtPort(INDICES_IDX).getRank() != 2ul) @@ -57,8 +62,10 @@ void EmbeddingBagPacked::initSupportedPrimitiveDescriptors() { return; std::string logPrefix = std::string("Layer EmbeddingBag with name '") + _layerName + "' "; - static const std::set supportedPrecisions = - {ov::element::f32, ov::element::i8, ov::element::u8, ov::element::i32}; + static const std::set supportedPrecisions = {ov::element::f32, + ov::element::i8, + ov::element::u8, + ov::element::i32}; auto inDataPrecision = getOriginalInputPrecisionAtPort(EMB_TABLE_IDX); if (one_of(inDataPrecision, ov::element::bf16, ov::element::f16)) @@ -67,14 +74,16 @@ void EmbeddingBagPacked::initSupportedPrimitiveDescriptors() { if (supportedPrecisions.find(inDataPrecision) == supportedPrecisions.end()) OPENVINO_THROW(logPrefix, "has unsupported precision: ", inDataPrecision.get_type_name()); } else { - static const std::set defaultSupportedPrecisions = - {ov::element::f32, ov::element::i8, ov::element::u8, ov::element::i32}; + static const std::set defaultSupportedPrecisions = {ov::element::f32, + ov::element::i8, + ov::element::u8, + ov::element::i32}; if (defaultSupportedPrecisions.find(inDataPrecision) == defaultSupportedPrecisions.end()) OPENVINO_THROW(logPrefix, "has unsupported precision: ", inDataPrecision.get_type_name()); } - std::vector inDataConfigurators({{LayoutType::ncsp, inDataPrecision}, - {LayoutType::ncsp, ov::element::i32}}); + std::vector inDataConfigurators( + {{LayoutType::ncsp, inDataPrecision}, {LayoutType::ncsp, ov::element::i32}}); if (inputShapes.size() > PER_SAMPLE_WEIGHTS_IDX) 
inDataConfigurators.push_back({LayoutType::ncsp, inDataPrecision}); @@ -91,7 +100,11 @@ void EmbeddingBagPacked::initFromInputs() { _indices = getSrcDataAtPortAs(INDICES_IDX); } -void EmbeddingBagPacked::getIndices(size_t embIndex, const int*& indices, size_t& size, int& weightsIdx, bool& withWeight) { +void EmbeddingBagPacked::getIndices(size_t embIndex, + const int*& indices, + size_t& size, + int& weightsIdx, + bool& withWeight) { if (static_cast(embIndex) >= _batch * _indicesPerBag) OPENVINO_THROW("Invalid embedding bag index."); @@ -112,20 +125,23 @@ bool EmbeddingBagPacked::isExecutable() const { } void EmbeddingBagPacked::execute(dnnl::stream strm) { - const auto *srcData = getSrcDataAtPortAs(0); + const auto* srcData = getSrcDataAtPortAs(0); const uint8_t* weightsData = nullptr; if (_withWeights) weightsData = getSrcDataAtPortAs(PER_SAMPLE_WEIGHTS_IDX); - const auto &inputMem = getParentEdgeAt(0)->getMemory(); - EmbeddingBag::execute(srcData, weightsData, inputMem.getDesc().getPrecision(), - inputMem.getStaticDims(), getDstMemoryAtPort(0)); + const auto& inputMem = getParentEdgeAt(0)->getMemory(); + EmbeddingBag::execute(srcData, + weightsData, + inputMem.getDesc().getPrecision(), + inputMem.getStaticDims(), + getDstMemoryAtPort(0)); } bool EmbeddingBagPacked::created() const { return getType() == Type::EmbeddingBagPacked; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.h b/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.h index 6a9d33fe3afccb..a018d1b48929e1 100644 --- a/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.h +++ b/src/plugins/intel_cpu/src/nodes/embedding_bag_packed.h @@ -15,7 +15,7 @@ class EmbeddingBagPacked : public Node, public EmbeddingBag { public: EmbeddingBagPacked(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void 
getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; @@ -36,6 +36,6 @@ class EmbeddingBagPacked : public Node, public EmbeddingBag { size_t _indicesPerBag = 0; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.cpp b/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.cpp index 2a012c6b941831..8bd91799834bad 100644 --- a/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.cpp +++ b/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.cpp @@ -2,17 +2,20 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "embedding_segments_sum.h" + #include -#include #include -#include "embedding_segments_sum.h" +#include + #include "openvino/opsets/opset3.hpp" namespace ov { namespace intel_cpu { namespace node { -bool EmbeddingSegmentsSum::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool EmbeddingSegmentsSum::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage) noexcept { try { const auto embBagSegSumOp = ov::as_type_ptr(op); if (!embBagSegSumOp) { @@ -46,8 +49,10 @@ void EmbeddingSegmentsSum::initSupportedPrimitiveDescriptors() { return; std::string logPrefix = std::string("Layer EmbeddingBag with name '") + _layerName + "' "; - static const std::set supportedPrecisions = - {ov::element::f32, ov::element::i8, ov::element::u8, ov::element::i32}; + static const std::set supportedPrecisions = {ov::element::f32, + ov::element::i8, + ov::element::u8, + ov::element::i32}; auto inDataPrecision = getOriginalInputPrecisionAtPort(EMB_TABLE_IDX); if (one_of(inDataPrecision, ov::element::bf16, ov::element::f16)) @@ -56,8 +61,10 @@ void EmbeddingSegmentsSum::initSupportedPrimitiveDescriptors() { if (supportedPrecisions.find(inDataPrecision) == 
supportedPrecisions.end()) OPENVINO_THROW(logPrefix, "has unsupported precision: ", inDataPrecision.get_type_name()); } else { - static const std::set defaultSupportedPrecisions = - {ov::element::f32, ov::element::i8, ov::element::u8, ov::element::i32}; + static const std::set defaultSupportedPrecisions = {ov::element::f32, + ov::element::i8, + ov::element::u8, + ov::element::i32}; if (defaultSupportedPrecisions.find(inDataPrecision) == defaultSupportedPrecisions.end()) OPENVINO_THROW(logPrefix, "has unsupported precision: ", inDataPrecision.get_type_name()); } @@ -90,7 +97,11 @@ void EmbeddingSegmentsSum::initFromInputs() { } } -void EmbeddingSegmentsSum::getIndices(size_t embIndex, const int*& indices, size_t& size, int& weightsIdx, bool& withWeight) { +void EmbeddingSegmentsSum::getIndices(size_t embIndex, + const int*& indices, + size_t& size, + int& weightsIdx, + bool& withWeight) { if (embIndex >= static_cast(lastNumSegments_)) OPENVINO_THROW("Invalid embedding bag index."); @@ -143,20 +154,23 @@ bool EmbeddingSegmentsSum::isExecutable() const { } void EmbeddingSegmentsSum::execute(dnnl::stream strm) { - const auto *srcData = getSrcDataAtPortAs(0); + const auto* srcData = getSrcDataAtPortAs(0); const uint8_t* weightsData = nullptr; if (_withWeights) weightsData = getSrcDataAtPortAs(PER_SAMPLE_WEIGHTS_IDX); - const auto &inputMem = getParentEdgeAt(0)->getMemory(); - EmbeddingBag::execute(srcData, weightsData, inputMem.getDesc().getPrecision(), - inputMem.getStaticDims(), getDstMemoryAtPort(0)); + const auto& inputMem = getParentEdgeAt(0)->getMemory(); + EmbeddingBag::execute(srcData, + weightsData, + inputMem.getDesc().getPrecision(), + inputMem.getStaticDims(), + getDstMemoryAtPort(0)); } bool EmbeddingSegmentsSum::created() const { return getType() == Type::EmbeddingSegmentsSum; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git 
a/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.h b/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.h index bb312b4dd47246..984b9de68690b2 100644 --- a/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.h +++ b/src/plugins/intel_cpu/src/nodes/embedding_segments_sum.h @@ -15,7 +15,7 @@ class EmbeddingSegmentsSum : public Node, public EmbeddingBag { public: EmbeddingSegmentsSum(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; @@ -45,6 +45,6 @@ class EmbeddingSegmentsSum : public Node, public EmbeddingBag { size_t indicesSize_ = 0; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp index 660db85cd61529..0374888e3d7fcb 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp @@ -26,6 +26,7 @@ bool JitEltwiseExecutor::isSupported( Algorithm::EltwiseEqual, Algorithm::EltwiseExp, Algorithm::EltwiseFloor, + Algorithm::EltwiseFloorMod, Algorithm::EltwiseCeiling, Algorithm::EltwiseGeluErf, Algorithm::EltwiseGeluTanh, @@ -48,6 +49,8 @@ bool JitEltwiseExecutor::isSupported( Algorithm::EltwisePowerStatic, Algorithm::EltwisePrelu, Algorithm::EltwiseRelu, + Algorithm::EltwiseRoundHalfAwayFromZero, + Algorithm::EltwiseRoundHalfToEven, Algorithm::EltwiseSelect, Algorithm::EltwiseSigmoid, Algorithm::EltwiseSoftSign, diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp index cc42691950a3ff..9660178e1af4a4 100644 --- 
a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp @@ -11,6 +11,7 @@ #include "nodes/executors/executor.hpp" #include "nodes/executors/memory_arguments.hpp" #include "utils/debug_capabilities.h" +#include "utils/cpu_utils.hpp" #include "nodes/executors/debug_messages.hpp" #include "nodes/executors/implementation_utils.hpp" #include "nodes/convert.h" @@ -201,9 +202,22 @@ static MemoryPtr prepareWeightMemory(const MemoryArgs &memory, MemoryArgs memoryArgs; memoryArgs[ARG_BIAS] = memory.at(ARG_BIAS); memoryArgs[ARG_WEI] = memory.at(ARG_WEI); + + auto originalWeightsDesc = memory.at(ARG_WEI)->getDescPtr(); + + // normalize weights to 2D + const auto& wgtDims = originalWeightsDesc->getShape().getStaticDims(); + const VectorDims wgtDims2D = reshapeDownToRank<2>(wgtDims); + + originalWeightsDesc = std::make_shared(originalWeightsDesc->getPrecision(), Shape{wgtDims2D}); + + auto dnnlSrcDesc = MemoryDescUtils::convertToDnnlMemoryDesc(originalWeightsDesc); + auto dstDesc = originalWeightsDesc->cloneWithNewPrecision(aclfcAttrs.inputPrecision); + auto dnnlDstDesc = MemoryDescUtils::convertToDnnlMemoryDesc(dstDesc); + if (memory.at(ARG_SRC_0)->getShape().isDynamic()) { const auto& inShape = memory.at(ARG_SRC_0)->getShape(); - const auto& wShape = memory.at(ARG_WEI)->getShape(); + const auto& wShape = originalWeightsDesc->getShape(); const auto& inDymmyDims = makeDummyInputDims(inShape, wShape); const auto& outDymmyDims = makeDummyOutputDims(inDymmyDims, wShape.getStaticDims(), memory.at(ARG_DST)->getShape().getRank()); memoryArgs[ARG_SRC_0] = std::make_shared(context->getEngine(), @@ -214,6 +228,7 @@ static MemoryPtr prepareWeightMemory(const MemoryArgs &memory, memoryArgs[ARG_SRC_0] = memory.at(ARG_SRC_0); memoryArgs[ARG_DST] = memory.at(ARG_DST); } + // TODO: ACLWeightFormatGenerator should be replaced with Reorder executor // that calls ACL NEReorder + NETranspose or dnnl::reorder 
depending on backend availability auto aclWeightsRepack = std::make_shared(attrs, postOps, memoryArgs); @@ -221,13 +236,6 @@ static MemoryPtr prepareWeightMemory(const MemoryArgs &memory, expectedWeightFormat = isNeededReorder ? aclWeightsRepack->getOptImplWeightFormat() : arm_compute::WeightFormat::UNSPECIFIED; weiTensorInfo = aclWeightsRepack->getTensorInfo(ACLArgs::ACL_WEI); - MemoryPtr dstMemPtr = std::make_shared(context->getEngine(), - memory.at(ARG_WEI)->getDescPtr()->cloneWithNewPrecision(aclfcAttrs.inputPrecision)); - auto dstDesc = dstMemPtr->getDescPtr(); - auto dnnlDstDesc = MemoryDescUtils::convertToDnnlMemoryDesc(dstDesc); - auto weiDesc = memory.at(ARG_WEI)->getDescPtr(); - auto dnnlSrcDesc = MemoryDescUtils::convertToDnnlMemoryDesc(weiDesc); - if (isNeededReorder) { dnnl::impl::dim_t o_dim = 0; dnnl::impl::dim_t inner_dim = 1; diff --git a/src/plugins/intel_cpu/src/nodes/executors/common/ref_convert.cpp b/src/plugins/intel_cpu/src/nodes/executors/common/ref_convert.cpp index 2bba0f5e73c0fe..de65176fb72235 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/common/ref_convert.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/common/ref_convert.cpp @@ -3,6 +3,7 @@ // #include "ref_convert.hpp" + #include "nodes/common/cpu_convert.h" namespace ov { @@ -13,9 +14,9 @@ bool CommonConvertExecutor::isSupported(ov::element::Type srcPrc, ov::element::T } bool CommonConvertExecutor::init(const ConvertParams& convertParams, - const MemoryDescPtr& srcDesc, - const MemoryDescPtr& dstDesc, - const dnnl::primitive_attr& attr) { + const MemoryDescPtr& srcDesc, + const MemoryDescPtr& dstDesc, + const dnnl::primitive_attr& attr) { commonConvertParams = convertParams; return true; } @@ -32,5 +33,5 @@ void CommonConvertExecutor::exec(const std::vector& src, const std:: commonConvertParams.size); } -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git 
a/src/plugins/intel_cpu/src/nodes/executors/common/ref_convert.hpp b/src/plugins/intel_cpu/src/nodes/executors/common/ref_convert.hpp index 337d377f3b3339..4bc3a709d2bcd2 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/common/ref_convert.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/common/ref_convert.hpp @@ -15,9 +15,11 @@ class CommonConvertExecutor : public ConvertExecutor { bool init(const ConvertParams& convertParams, const MemoryDescPtr& srcDesc, const MemoryDescPtr& dstDesc, - const dnnl::primitive_attr &attr) override; + const dnnl::primitive_attr& attr) override; void exec(const std::vector& src, const std::vector& dst) override; - impl_desc_type implType() const override { return implDescType; }; + impl_desc_type implType() const override { + return implDescType; + }; static bool isSupported(ov::element::Type srcPrc, ov::element::Type dstPrc); protected: @@ -26,7 +28,6 @@ class CommonConvertExecutor : public ConvertExecutor { const ExecutorContext::CPtr convertContext; }; - class CommonConvertExecutorBuilder : public ConvertExecutorBuilder { public: ~CommonConvertExecutorBuilder() = default; @@ -40,5 +41,5 @@ class CommonConvertExecutorBuilder : public ConvertExecutorBuilder { } }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/common/ref_opt_transpose.cpp b/src/plugins/intel_cpu/src/nodes/executors/common/ref_opt_transpose.cpp index 0e1d43b48f6224..dd0cea3d238a4e 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/common/ref_opt_transpose.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/common/ref_opt_transpose.cpp @@ -3,6 +3,7 @@ // #include "ref_opt_transpose.hpp" + #include "openvino/core/parallel.hpp" namespace ov { @@ -26,21 +27,15 @@ void transpose_to_0312(const int MB, const MemoryCPtr& srcMemPtr, MemoryPtr& dst parallel_for3d(MB, DIM1, DIM2, [&](const int n, const int dim1, const int dim2) { for (int dim3 = 0; dim3 < DIM3; ++dim3) 
{ - const int src_off = n * DIM1 * DIM2 * DIM3 + - dim1 * DIM2 * DIM3 + - dim2 * DIM3 + - dim3; - const int dst_off = n * DIM1 * DIM2 * DIM3 + - dim3 * DIM1 * DIM2 + - dim1 * DIM2 + - dim2; + const int src_off = n * DIM1 * DIM2 * DIM3 + dim1 * DIM2 * DIM3 + dim2 * DIM3 + dim3; + const int dst_off = n * DIM1 * DIM2 * DIM3 + dim3 * DIM1 * DIM2 + dim1 * DIM2 + dim2; dst_data[dst_off] = src_data[src_off]; } }); } -template +template void transpose_to_04123(const int MB, const MemoryCPtr& srcMemPtr, MemoryPtr& dstMemPtr) { const auto src_data = srcMemPtr->getDataAs(); auto dst_data = dstMemPtr->getDataAs(); @@ -52,23 +47,17 @@ void transpose_to_04123(const int MB, const MemoryCPtr& srcMemPtr, MemoryPtr& ds parallel_for4d(MB, DIM1, DIM2, DIM3, [&](const int n, const int dim1, const int dim2, const int dim3) { for (int dim4 = 0; dim4 < DIM4; ++dim4) { - const int src_off = n * DIM1 * DIM2 * DIM3 * DIM4 + - dim1 * DIM2 * DIM3 * DIM4 + - dim2 * DIM3 * DIM4 + - dim3 * DIM4 + - dim4; - const int dst_off = n * DIM1 * DIM2 * DIM3 * DIM4 + - dim4 * DIM1 * DIM2 * DIM3 + - dim1 * DIM2 * DIM3 + - dim2 * DIM3 + - dim3; + const int src_off = + n * DIM1 * DIM2 * DIM3 * DIM4 + dim1 * DIM2 * DIM3 * DIM4 + dim2 * DIM3 * DIM4 + dim3 * DIM4 + dim4; + const int dst_off = + n * DIM1 * DIM2 * DIM3 * DIM4 + dim4 * DIM1 * DIM2 * DIM3 + dim1 * DIM2 * DIM3 + dim2 * DIM3 + dim3; dst_data[dst_off] = src_data[src_off]; } }); } -template +template void transpose_to_051234(const int MB, const MemoryCPtr& srcMemPtr, MemoryPtr& dstMemPtr) { const auto src_data = srcMemPtr->getDataAs(); auto dst_data = dstMemPtr->getDataAs(); @@ -79,61 +68,61 @@ void transpose_to_051234(const int MB, const MemoryCPtr& srcMemPtr, MemoryPtr& d const int DIM4 = srcMemPtr->getStaticDims()[4]; const int DIM5 = srcMemPtr->getStaticDims()[5]; - parallel_for5d(MB, DIM1, DIM2, DIM3, DIM4, [&](const int n, const int dim1, const int dim2, const int dim3, const int dim4) { - for (int dim5 = 0; dim5 < DIM5; ++dim5) { - const int 
src_off = n * DIM1 * DIM2 * DIM3 * DIM4 * DIM5 + - dim1 * DIM2 * DIM3 * DIM4 * DIM5 + - dim2 * DIM3 * DIM4 * DIM5 + - dim3 * DIM4 * DIM5 + - dim4 * DIM5 + - dim5; - const int dst_off = n * DIM5 * DIM1 * DIM2 * DIM3 * DIM4 + - dim5 * DIM1 * DIM2 * DIM3 * DIM4 + - dim1 * DIM2 * DIM3 * DIM4 + - dim2 * DIM3 * DIM4 + - dim3 * DIM4 + - dim4; - - dst_data[dst_off] = src_data[src_off]; - } - }); + parallel_for5d(MB, + DIM1, + DIM2, + DIM3, + DIM4, + [&](const int n, const int dim1, const int dim2, const int dim3, const int dim4) { + for (int dim5 = 0; dim5 < DIM5; ++dim5) { + const int src_off = n * DIM1 * DIM2 * DIM3 * DIM4 * DIM5 + dim1 * DIM2 * DIM3 * DIM4 * DIM5 + + dim2 * DIM3 * DIM4 * DIM5 + dim3 * DIM4 * DIM5 + dim4 * DIM5 + dim5; + const int dst_off = n * DIM5 * DIM1 * DIM2 * DIM3 * DIM4 + dim5 * DIM1 * DIM2 * DIM3 * DIM4 + + dim1 * DIM2 * DIM3 * DIM4 + dim2 * DIM3 * DIM4 + dim3 * DIM4 + dim4; + + dst_data[dst_off] = src_data[src_off]; + } + }); } -template +template struct TransposeOptimizedEmitter { void operator()(TransposeContext& ctx) { switch (ctx.srcMemPtr->getStaticDims().size()) { - case 4: - transpose_to_0312(ctx.MB, ctx.srcMemPtr, ctx.dstMemPtr); - break; - case 5: - transpose_to_04123(ctx.MB, ctx.srcMemPtr, ctx.dstMemPtr); - break; - case 6: - transpose_to_051234(ctx.MB, ctx.srcMemPtr, ctx.dstMemPtr); - break; - default: - OPENVINO_THROW("Transpose supports optimized execution with only 4D, 5D and 6D shapes"); + case 4: + transpose_to_0312(ctx.MB, ctx.srcMemPtr, ctx.dstMemPtr); + break; + case 5: + transpose_to_04123(ctx.MB, ctx.srcMemPtr, ctx.dstMemPtr); + break; + case 6: + transpose_to_051234(ctx.MB, ctx.srcMemPtr, ctx.dstMemPtr); + break; + default: + OPENVINO_THROW("Transpose supports optimized execution with only 4D, 5D and 6D shapes"); } } }; -} // namespace +} // namespace void RefOptimizedTransposeExecutor::exec(const std::vector& src, const std::vector& dst) { const size_t dataSize = src[0]->getDesc().getPrecision().size(); const int MB = 
src[0]->getStaticDims()[0]; TransposeContext ctx = {src[0], dst[0], MB}; - OV_SWITCH(intel_cpu, TransposeOptimizedEmitter, ctx, dataSize, + OV_SWITCH(intel_cpu, + TransposeOptimizedEmitter, + ctx, + dataSize, OV_CASE(1u, element_type_traits::value_type), OV_CASE(2u, element_type_traits::value_type), OV_CASE(4u, element_type_traits::value_type)); } -bool RefOptimizedTransposeExecutor::init(const TransposeParams &transposeParams, - const std::vector &srcDescs, - const std::vector &dstDescs, - const dnnl::primitive_attr &attr) { +bool RefOptimizedTransposeExecutor::init(const TransposeParams& transposeParams, + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr) { return true; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/common/ref_opt_transpose.hpp b/src/plugins/intel_cpu/src/nodes/executors/common/ref_opt_transpose.hpp index be420bfb009e5a..65da099caa0f33 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/common/ref_opt_transpose.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/common/ref_opt_transpose.hpp @@ -13,12 +13,14 @@ class RefOptimizedTransposeExecutor : public TransposeExecutor { public: using TransposeExecutor::TransposeExecutor; - bool init(const TransposeParams &transposeParams, - const std::vector &srcDescs, - const std::vector &dstDescs, - const dnnl::primitive_attr &attr) override; - void exec(const std::vector &src, const std::vector &dst) override; - impl_desc_type implType() const override { return impl_desc_type::ref; } + bool init(const TransposeParams& transposeParams, + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr) override; + void exec(const std::vector& src, const std::vector& dst) override; + impl_desc_type implType() const override { + return impl_desc_type::ref; + } }; class RefOptimizedTransposeExecutorBuilder : public 
TransposeExecutorBuilder { @@ -27,12 +29,13 @@ class RefOptimizedTransposeExecutorBuilder : public TransposeExecutorBuilder { const std::vector& srcDescs, const std::vector& dstDescs) const override { static const std::vector> optimizedOrders = { - std::vector{0, 3, 1, 2}, - std::vector{0, 4, 1, 2, 3}, - std::vector{0, 5, 1, 2, 3, 4}, + std::vector{0, 3, 1, 2}, + std::vector{0, 4, 1, 2, 3}, + std::vector{0, 5, 1, 2, 3, 4}, }; if (srcDescs[0]->hasLayoutType(LayoutType::ncsp) && - std::find(optimizedOrders.begin(), optimizedOrders.end(), transposeParams.permuteParams.order) != optimizedOrders.end()) { + std::find(optimizedOrders.begin(), optimizedOrders.end(), transposeParams.permuteParams.order) != + optimizedOrders.end()) { return true; } DEBUG_LOG("RefOptimizedTransposeExecutor is not supported, because passed order is not optimized"); @@ -44,5 +47,5 @@ class RefOptimizedTransposeExecutorBuilder : public TransposeExecutorBuilder { } }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/common/ref_transpose.cpp b/src/plugins/intel_cpu/src/nodes/executors/common/ref_transpose.cpp index 8db8798ef8eaff..1716f008027fe9 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/common/ref_transpose.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/common/ref_transpose.cpp @@ -3,8 +3,9 @@ // #include "ref_transpose.hpp" -#include "openvino/core/parallel.hpp" + #include "nodes/common/cpu_memcpy.h" +#include "openvino/core/parallel.hpp" namespace ov { namespace intel_cpu { @@ -27,7 +28,10 @@ static inline void parallel_step(size_t nDims, const VectorDims& dims, VectorDim } } -void RefTransposeExecutor::referenceExecute(const uint8_t* src_data, uint8_t* dst_data, jit_permute_config_params jcp, const int mb) { +void RefTransposeExecutor::referenceExecute(const uint8_t* src_data, + uint8_t* dst_data, + jit_permute_config_params jcp, + const int mb) { VectorDims dst_dims = 
jcp.dst_block_dims; const VectorDims dst_strides = jcp.dst_strides; const VectorDims src_strides = jcp.src_strides; @@ -70,13 +74,13 @@ void RefTransposeExecutor::exec(const std::vector& src, const std::v referenceExecute(src_data, dst_data, jcp, MB); } -bool RefTransposeExecutor::init(const TransposeParams &transposeParams, - const std::vector &srcDescs, - const std::vector &dstDescs, - const dnnl::primitive_attr &attr) { +bool RefTransposeExecutor::init(const TransposeParams& transposeParams, + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr) { jcp = TransposeExecutor::prepareParams(transposeParams.permuteParams); return true; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/common/ref_transpose.hpp b/src/plugins/intel_cpu/src/nodes/executors/common/ref_transpose.hpp index 206d610368a9df..00c1602c0bd119 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/common/ref_transpose.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/common/ref_transpose.hpp @@ -11,13 +11,19 @@ namespace intel_cpu { class RefTransposeExecutor : public TransposeExecutor { public: using TransposeExecutor::TransposeExecutor; - static void referenceExecute(const uint8_t* src_data, uint8_t* dst_data, jit_permute_config_params jcp, const int mb); - bool init(const TransposeParams &transposeParams, - const std::vector &srcDescs, - const std::vector &dstDescs, - const dnnl::primitive_attr &attr) override; - void exec(const std::vector &src, const std::vector &dst) override; - impl_desc_type implType() const override { return impl_desc_type::ref; } + static void referenceExecute(const uint8_t* src_data, + uint8_t* dst_data, + jit_permute_config_params jcp, + const int mb); + bool init(const TransposeParams& transposeParams, + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr) override; + void exec(const 
std::vector& src, const std::vector& dst) override; + impl_desc_type implType() const override { + return impl_desc_type::ref; + } + private: jit_permute_config_params jcp; }; @@ -35,5 +41,5 @@ class RefTransposeExecutorBuilder : public TransposeExecutorBuilder { } }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/convert.cpp b/src/plugins/intel_cpu/src/nodes/executors/convert.cpp index c8d7ce8addaf22..32141d53b10ee5 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/convert.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/convert.cpp @@ -4,4 +4,5 @@ #include "convert.hpp" -ov::intel_cpu::ConvertExecutor::ConvertExecutor(const ov::intel_cpu::ExecutorContext::CPtr context) : convertContext(context) {} \ No newline at end of file +ov::intel_cpu::ConvertExecutor::ConvertExecutor(const ov::intel_cpu::ExecutorContext::CPtr context) + : convertContext(context) {} \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/convert.hpp b/src/plugins/intel_cpu/src/nodes/executors/convert.hpp index ce766663a0b653..dcb0bdde2ce219 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/convert.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/convert.hpp @@ -5,8 +5,8 @@ #pragma once #include "cpu_memory.h" -#include "onednn/iml_type_mapper.h" #include "executor.hpp" +#include "onednn/iml_type_mapper.h" namespace ov { namespace intel_cpu { @@ -24,8 +24,9 @@ class ConvertExecutor : public Executor { virtual bool init(const ConvertParams& convertParams, const MemoryDescPtr& srcDesc, const MemoryDescPtr& dstDesc, - const dnnl::primitive_attr &attr) = 0; + const dnnl::primitive_attr& attr) = 0; virtual ~ConvertExecutor() = default; + protected: ConvertParams convertParams; const ExecutorContext::CPtr convertContext; @@ -45,5 +46,5 @@ class ConvertExecutorBuilder { using ConvertExecutorBuilderPtr = std::shared_ptr; using ConvertExecutorBuilderCPtr = 
std::shared_ptr; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/convert_list.cpp b/src/plugins/intel_cpu/src/nodes/executors/convert_list.cpp index 504c310ca15124..5375bd21166cc4 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/convert_list.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/convert_list.cpp @@ -9,9 +9,8 @@ namespace intel_cpu { const std::vector& getConvertExecutorsList() { static std::vector descs = { - OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared()) - OV_CPU_INSTANCE_COMMON(ExecutorType::Common, std::make_shared()) - }; + OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared()) + OV_CPU_INSTANCE_COMMON(ExecutorType::Common, std::make_shared())}; return descs; } @@ -45,5 +44,5 @@ ConvertExecutorPtr ConvertExecutorFactory::makeExecutor(const ConvertParams& con OPENVINO_THROW("Supported executor is not found"); } -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/convert_list.hpp b/src/plugins/intel_cpu/src/nodes/executors/convert_list.hpp index a7ed05ceb634e4..9ea47f916d859f 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/convert_list.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/convert_list.hpp @@ -4,17 +4,15 @@ #pragma once -#include "executor.hpp" - #include "convert.hpp" +#include "executor.hpp" #if defined(OV_CPU_WITH_ACL) -#include "acl/acl_convert.hpp" +# include "acl/acl_convert.hpp" #endif +#include "common/primitive_cache.hpp" #include "common/ref_convert.hpp" - #include "onednn/iml_type_mapper.h" -#include "common/primitive_cache.hpp" namespace ov { namespace intel_cpu { @@ -31,7 +29,8 @@ class ConvertExecutorFactory : public ExecutorFactoryLegacy { ConvertExecutorFactory(const ConvertParams& convertParams, const MemoryDescPtr& srcDesc, const MemoryDescPtr& 
dstDesc, - const ExecutorContext::CPtr context) : ExecutorFactoryLegacy(context) { + const ExecutorContext::CPtr context) + : ExecutorFactoryLegacy(context) { for (auto& desc : getConvertExecutorsList()) { if (desc.builder->isSupported(convertParams, srcDesc, dstDesc)) { supportedDescs.push_back(desc); @@ -43,7 +42,7 @@ class ConvertExecutorFactory : public ExecutorFactoryLegacy { virtual ConvertExecutorPtr makeExecutor(const ConvertParams& convertParams, const MemoryDescPtr& srcDesc, const MemoryDescPtr& dstDesc, - const dnnl::primitive_attr &attr); + const dnnl::primitive_attr& attr); private: std::vector supportedDescs; @@ -53,5 +52,5 @@ class ConvertExecutorFactory : public ExecutorFactoryLegacy { using ConvertExecutorFactoryPtr = std::shared_ptr; using ConvertExecutorFactoryCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/debug_messages.hpp b/src/plugins/intel_cpu/src/nodes/executors/debug_messages.hpp index 26ae6ace59631b..222779a00ee18f 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/debug_messages.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/debug_messages.hpp @@ -4,25 +4,25 @@ #pragma once -#define UNSUPPORTED_SPARSE_WEIGHTS " sparse weights are not supported" +#define UNSUPPORTED_SPARSE_WEIGHTS " sparse weights are not supported" #define UNSUPPORTED_WEIGHTS_DECOMPRESSION " weights decompression is not supported" -#define UNSUPPORTED_POST_OPS " post ops are not supported" -#define UNSUPPORTED_NUMBER_OF_POSTOPS " the number of post ops is not supported" -#define UNSUPPORTED_TYPE_OF_POSTOPS " the type of post ops is not supported" -#define UNSUPPORTED_SRC_PRECISIONS " unsupported src precisions" -#define UNSUPPORTED_WEI_PRECISIONS " unsupported wei precisions" -#define UNSUPPORTED_DST_PRECISIONS " unsupported dst precisions" -#define UNSUPPORTED_ISA " unsupported isa" -#define UNSUPPORTED_SRC_RANK " unsupported src rank" 
-#define UNSUPPORTED_WEI_RANK " unsupported wei rank" -#define UNSUPPORTED_DST_RANK " unsupported dst rank" -#define UNSUPPORTED_DST_STRIDES " unsupported dst strides" -#define HEURISTICS_MISMATCH " heuristics mismatch" +#define UNSUPPORTED_POST_OPS " post ops are not supported" +#define UNSUPPORTED_NUMBER_OF_POSTOPS " the number of post ops is not supported" +#define UNSUPPORTED_TYPE_OF_POSTOPS " the type of post ops is not supported" +#define UNSUPPORTED_SRC_PRECISIONS " unsupported src precisions" +#define UNSUPPORTED_WEI_PRECISIONS " unsupported wei precisions" +#define UNSUPPORTED_DST_PRECISIONS " unsupported dst precisions" +#define UNSUPPORTED_ISA " unsupported isa" +#define UNSUPPORTED_SRC_RANK " unsupported src rank" +#define UNSUPPORTED_WEI_RANK " unsupported wei rank" +#define UNSUPPORTED_DST_RANK " unsupported dst rank" +#define UNSUPPORTED_DST_STRIDES " unsupported dst strides" +#define HEURISTICS_MISMATCH " heuristics mismatch" -#define VERIFY(condition, ...) \ - do { \ - if (!(condition)) { \ +#define VERIFY(condition, ...) 
\ + do { \ + if (!(condition)) { \ DEBUG_LOG(__VA_ARGS__); \ - return false; \ - } \ + return false; \ + } \ } while (0) diff --git a/src/plugins/intel_cpu/src/nodes/executors/deconv.cpp b/src/plugins/intel_cpu/src/nodes/executors/deconv.cpp index 23e0910bd0c82c..e485815e950af4 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/deconv.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/deconv.cpp @@ -5,8 +5,5 @@ #include "deconv.hpp" namespace ov { -namespace intel_cpu { - - -} // namespace intel_cpu -} // namespace ov +namespace intel_cpu {} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/deconv.hpp b/src/plugins/intel_cpu/src/nodes/executors/deconv.hpp index c632cc0cf99ad1..11920c0ab35b49 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/deconv.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/deconv.hpp @@ -34,11 +34,11 @@ class DeconvExecutor { virtual bool init(const DeconvAttrs& deconvAttrs, const std::vector& srcDescs, const std::vector& dstDescs, - const dnnl::primitive_attr &attr) = 0; + const dnnl::primitive_attr& attr) = 0; virtual void exec(const std::vector& src, const std::vector& dst, - const void *post_ops_data_) = 0; + const void* post_ops_data_) = 0; virtual ~DeconvExecutor() = default; virtual impl_desc_type getImplType() const = 0; @@ -53,12 +53,14 @@ using DeconvExecutorCPtr = std::shared_ptr; class DeconvExecutorBuilder { public: ~DeconvExecutorBuilder() = default; - virtual bool isSupported(const DeconvAttrs& convAttrs, const std::vector& srcDescs, const std::vector& dstDescs) const = 0; + virtual bool isSupported(const DeconvAttrs& convAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs) const = 0; virtual DeconvExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const = 0; }; using DeconvExecutorBuilderPtr = std::shared_ptr; using DeconvExecutorBuilderCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // 
namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/deconv_list.cpp b/src/plugins/intel_cpu/src/nodes/executors/deconv_list.cpp index f5b897c2d1b6e1..c093057e47413f 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/deconv_list.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/deconv_list.cpp @@ -9,11 +9,10 @@ namespace intel_cpu { const std::vector& getDeconvExecutorsList() { static std::vector descs = { - OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared()) - }; + OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared())}; return descs; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/deconv_list.hpp b/src/plugins/intel_cpu/src/nodes/executors/deconv_list.hpp index 4c63a565aac2e0..fd114094303808 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/deconv_list.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/deconv_list.hpp @@ -4,15 +4,14 @@ #pragma once -#include "executor.hpp" - #include "deconv.hpp" +#include "executor.hpp" #if defined(OV_CPU_WITH_ACL) -#include "acl/acl_deconv.hpp" +# include "acl/acl_deconv.hpp" #endif -#include "onednn/iml_type_mapper.h" #include "common/primitive_cache.hpp" +#include "onednn/iml_type_mapper.h" namespace ov { namespace intel_cpu { @@ -29,7 +28,8 @@ class DeconvExecutorFactory : public ExecutorFactoryLegacy { DeconvExecutorFactory(const DeconvAttrs& deconvAttrs, const std::vector& srcDescs, const std::vector& dstDescs, - const ExecutorContext::CPtr context) : ExecutorFactoryLegacy(context) { + const ExecutorContext::CPtr context) + : ExecutorFactoryLegacy(context) { for (auto& desc : getDeconvExecutorsList()) { if (desc.builder->isSupported(deconvAttrs, srcDescs, dstDescs)) { supportedDescs.push_back(desc); @@ -41,7 +41,7 @@ class DeconvExecutorFactory : public ExecutorFactoryLegacy { virtual DeconvExecutorPtr makeExecutor(const DeconvAttrs& deconvAttrs, const std::vector& srcDescs, const 
std::vector& dstDescs, - const dnnl::primitive_attr &attr) { + const dnnl::primitive_attr& attr) { auto build = [&](const DeconvExecutorDesc* desc) { auto executor = desc->builder->makeExecutor(context); if (executor->init(deconvAttrs, srcDescs, dstDescs, attr)) { @@ -75,5 +75,5 @@ class DeconvExecutorFactory : public ExecutorFactoryLegacy { using DeconvExecutorFactoryPtr = std::shared_ptr; using DeconvExecutorFactoryCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_aliases.hpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_aliases.hpp index a611e94f617e44..27fa7dd38d7a99 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_aliases.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_aliases.hpp @@ -4,8 +4,8 @@ #pragma once -#include #include +#include namespace ov { namespace intel_cpu { diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_convolution_primitive.cpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_convolution_primitive.cpp index 8f9d7ad0805e41..61aca683a37687 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_convolution_primitive.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_convolution_primitive.cpp @@ -157,8 +157,7 @@ static DnnlPrimitiveAttrs createPrimitiveAttrs(const ConvAttrs& attrs, one_of(srcDesc->getPrecision(), ov::element::u8, ov::element::i8) && weiDesc->getPrecision() == ov::element::i8; auto outputDataType = DnnlExtensionUtils::ElementTypeToDataType(dstDesc->getPrecision()); - DnnlPostOpsComposer - dnnlpoc(postOps, context->getEngine(), dims, 1, isINT8, 1 << 0, {}, attrs.withBias, outputDataType); + DnnlPostOpsComposer dnnlpoc(postOps, context->getEngine(), dims, 1, isINT8, 1 << 0, memory, outputDataType); return dnnlpoc.compose(); } diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected.hpp 
b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected.hpp index 266e78b3d46c77..db5c8bed2e43e1 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected.hpp @@ -8,12 +8,12 @@ #include #include "cpu_memory.h" -#include "nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp" -#include "nodes/executors/dnnl/dnnl_convolution_primitive.hpp" +#include "memory_desc/cpu_memory_desc_utils.h" #include "nodes/executors/dnnl/dnnl_aliases.hpp" +#include "nodes/executors/dnnl/dnnl_utils.hpp" #include "nodes/executors/executor.hpp" -#include "memory_desc/cpu_memory_desc_utils.h" #include "nodes/executors/memory_arguments.hpp" +#include "post_ops.hpp" namespace ov { namespace intel_cpu { @@ -73,7 +73,7 @@ class DnnlFCExecutor : public Executor { return; } const auto newPrimMemDesc = m_primitive->scratchPadDesc(); - m_scratchPadMemory = m_context->getScratchPad(numaNodeID)->createScratchPadMem(newPrimMemDesc); + m_scratchPadMemory = m_context->getScratchPad()->createScratchPadMem(newPrimMemDesc); m_primArgs[DNNL_ARG_SCRATCHPAD] = m_scratchPadMemory->getPrimitive(); if (m_primArgs.count(DNNL_ARG_WEIGHTS)) { @@ -123,7 +123,8 @@ class DnnlFCExecutor : public Executor { if (currentPrimitive && currentPrimitive->weightsDesc()->isCompatible(*newPrimMemDesc)) return; - originalMemDesc = Primitive::makeTransposedWeightDescriptor(originalMemDesc, newPrimMemDesc, m_attrs.weightsNonTransposed); + originalMemDesc = + Primitive::makeTransposedWeightDescriptor(originalMemDesc, newPrimMemDesc, m_attrs.weightsNonTransposed); const auto weiMemory = utils::prepareWeightsMemory(originalMemDesc, newPrimMemDesc, memory, m_context, true); m_primArgs[DNNL_ARG_WEIGHTS] = weiMemory->getPrimitive(); @@ -139,13 +140,11 @@ class DnnlFCExecutor : public Executor { if (currentPrimitive && currentPrimitive->scratchPadDesc()->isCompatible(*newPrimMemDesc)) return; - m_scratchPadMemory = 
m_context->getScratchPad(curNumaNode)->createScratchPadMem(newPrimMemDesc); + m_scratchPadMemory = m_context->getScratchPad()->createScratchPadMem(newPrimMemDesc); m_primArgs[DNNL_ARG_SCRATCHPAD] = m_scratchPadMemory->getPrimitive(); } - void updateMemory(const PrimitivePtr currentPrimitive, - const PrimitivePtr newPrimitive, - const MemoryArgs& memory) { + void updateMemory(const PrimitivePtr currentPrimitive, const PrimitivePtr newPrimitive, const MemoryArgs& memory) { const auto& srcDesc = MemoryDescUtils::convertToDnnlMemoryDesc(memory.at(ARG_SRC)->getDescPtr()); const auto& weiDesc = MemoryDescUtils::convertToDnnlMemoryDesc(memory.at(ARG_WEI)->getDescPtr()); const auto& dstDesc = MemoryDescUtils::convertToDnnlMemoryDesc(memory.at(ARG_DST)->getDescPtr()); diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp index fcb70d4753b2ce..52434a1eeb8461 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -27,6 +28,7 @@ #include "nodes/executors/executor.hpp" #include "nodes/executors/fullyconnected_config.hpp" #include "nodes/executors/memory_arguments.hpp" +#include "utils/cpu_utils.hpp" #include "utils/debug_capabilities.h" namespace ov { @@ -72,9 +74,8 @@ bool DnnlFCPrimitive::Key::operator==(const Key& rhs) const { result = result && dst && rhs.dst && dst->getDnnlDesc() == rhs.dst->getDnnlDesc(); } - result = result && *attr.get() == *rhs.attr.get() && - sparseWeights == rhs.sparseWeights && - modelType == rhs.modelType; + result = + result && *attr.get() == *rhs.attr.get() && sparseWeights == rhs.sparseWeights && modelType == rhs.modelType; return result; } @@ -115,9 +116,10 @@ DnnlMemoryDescPtr 
DnnlFCPrimitive::makeTransposedWeightDescriptor(const DnnlMemo return srcDesc; const auto& weiDesc = srcDesc->getDnnlDesc(); - const auto reorderedWeiDesc = - dnnl::memory::desc{weiDesc.get_dims(), weiDesc.get_data_type(), dnnl::memory::format_tag::ba}; - const auto transposedWeiDesc = reorderedWeiDesc.reshape(dstDesc->getDnnlDesc().get_dims()); + auto wDims = weiDesc.get_dims(); + dnnl::memory::dims wDims2D = reshapeDownToRank<2>(wDims); + + const auto transposedWeiDesc = dnnl::memory::desc{wDims2D, weiDesc.get_data_type(), dnnl::memory::format_tag::ba}; return DnnlExtensionUtils::makeDescriptor(transposedWeiDesc); } @@ -140,12 +142,11 @@ bool DnnlFCPrimitive::useWeightsDecompressionImpl(const ov::element::Type inputT return false; } -bool DnnlFCPrimitive::useDynamicQuantizationImpl(size_t dqGroupSize, - const MemoryDescPtr srcDesc, - const MemoryDescPtr weightsDesc, - MemoryCPtr scalesPtr, - MemoryCPtr zpPtr, - bool needTranspose) { +static bool useDynamicQuantizationImpl(size_t dqGroupSize, + const MemoryDescPtr srcDesc, + const MemoryDescPtr weightsDesc, + const MemoryArgs& memory, + bool needTranspose) { if (dqGroupSize == 0) return false; @@ -155,6 +156,9 @@ bool DnnlFCPrimitive::useDynamicQuantizationImpl(size_t dqGroupSize, if (srcDesc->getPrecision() != ov::element::f32) return false; + + MemoryCPtr zpPtr = + memory.count(ARG_WEI | ARG_ATTR_ZERO_POINTS) ? memory.at(ARG_WEI | ARG_ATTR_ZERO_POINTS) : nullptr; // For dynamic quantization, VNNI accumulation requires weight to be unsigned. // To support dynamic quantization with weights symmetrically quantized as i8/i4 // w/o zero-point, we will transform weight to u8/u4 weight with zp 128/8. @@ -177,11 +181,15 @@ bool DnnlFCPrimitive::useDynamicQuantizationImpl(size_t dqGroupSize, if (weightsDesc->getPrecision() == ov::element::u4) { int ic = weightsDesc->getShape().getStaticDims()[1]; int minGroupSize = INT_MAX; + + MemoryCPtr scalesPtr = memory.count(ARG_WEI | ARG_ATTR_SCALES) ? 
memory.at(ARG_WEI | ARG_ATTR_SCALES) : nullptr; + if (scalesPtr && scalesPtr->getShape().getRank() == 3) { auto scalesDims = scalesPtr->getShape().getStaticDims(); auto groupsNum = needTranspose ? scalesDims[1] : scalesDims[0]; minGroupSize = ic / groupsNum; } + if (zpPtr && zpPtr->getShape().getRank() == 3) { auto zpDims = zpPtr->getShape().getStaticDims(); int groupsNum = needTranspose ? zpDims[1] : zpDims[0]; @@ -196,11 +204,6 @@ bool DnnlFCPrimitive::useDynamicQuantizationImpl(size_t dqGroupSize, return true; } -template -static std::vector normalizeDimsTo2D(const std::vector& dims) { - return {std::accumulate(dims.begin(), dims.end() - 1, (T)1, std::multiplies()), dims[dims.size() - 1]}; -} - static DnnlPrimitiveAttrs createPrimitiveAttrs(const FCAttrs& attrs, const PostOps& postOps, const MemoryArgs& memory, @@ -211,33 +214,30 @@ static DnnlPrimitiveAttrs createPrimitiveAttrs(const FCAttrs& attrs, const auto& dstDesc = memory.at(ARG_DST)->getDescPtr(); const auto& originalDims = dstDesc->getShape().getMinDims(); - const auto& dims = normalizeDimsTo2D(originalDims); + const auto& dims = reshapeDownToRank<2>(originalDims); auto isINT8 = one_of(srcDesc->getPrecision(), ov::element::u8, ov::element::i8) && weiDesc->getPrecision() == ov::element::i8; auto outputDataType = DnnlExtensionUtils::ElementTypeToDataType(dstDesc->getPrecision()); - DnnlPostOpsComposer dnnlpoc(postOps, - context->getEngine(), - dims, - dims.size() - 1, - isINT8, - 1 << 0, - attrs.dequantizationScales, - !memory.at(ARG_BIAS)->getDesc().empty(), - outputDataType); - - if (attrs.decompressionMultiplyPtr) { - auto dstPrc = attrs.decompressionMultiplyPtr->getPrecision(); + DnnlPostOpsComposer + dnnlpoc(postOps, context->getEngine(), dims, dims.size() - 1, isINT8, 1 << 0, memory, outputDataType); + + if (memory.count(ARG_WEI | ARG_ATTR_SCALES)) { + auto dstPrc = memory.at(ARG_WEI | ARG_ATTR_SCALES)->getPrecision(); if (dstPrc != f8e8m0 || useDynamicQuantization) dstPrc = ov::element::f32; - 
dnnlpoc.appendDecompressionScales(attrs.decompressionMultiplyPtr, !attrs.weightsNonTransposed, dstPrc); + dnnlpoc.appendDecompressionScales(memory.at(ARG_WEI | ARG_ATTR_SCALES), !attrs.weightsNonTransposed, dstPrc); } - if (attrs.decompressionSubtractPtr) { + + if (memory.count(ARG_WEI | ARG_ATTR_ZERO_POINTS)) { auto dstPrc = useDynamicQuantization ? ov::element::u8 : ov::element::f32; - dnnlpoc.appendDecompressionZeroPoints(attrs.decompressionSubtractPtr, !attrs.weightsNonTransposed, dstPrc); + dnnlpoc.appendDecompressionZeroPoints(memory.at(ARG_WEI | ARG_ATTR_ZERO_POINTS), + !attrs.weightsNonTransposed, + dstPrc); } + if (useDynamicQuantization) { auto wei_precision = weiDesc->getPrecision(); bool is_symmetric_weights = (wei_precision == ov::element::i8) || (wei_precision == ov::element::i4); @@ -261,7 +261,7 @@ static dnnl::memory::desc normalizeDescriptor(const dnnl::memory::desc& desc) { const auto& dims = desc.get_dims(); if (dims.size() > 2) - return desc.reshape(normalizeDimsTo2D(dims)); + return desc.reshape(reshapeDownToRank<2>(dims)); return desc; } @@ -276,12 +276,13 @@ static dnnl::inner_product_forward::primitive_desc createDescriptorInternal(cons const bool useWeightsDecompression) { const auto normalizedInputDesc = normalizeDescriptor(inputDesc); const auto normalizedOutputDesc = normalizeDescriptor(outputDesc); + const auto normalizedWeightDesc = normalizeDescriptor(weightDesc); const auto indt = normalizedInputDesc.get_data_type(); auto wdt = indt; if (useWeightsDecompression) { - wdt = weightDesc.get_data_type(); + wdt = normalizedWeightDesc.get_data_type(); // dynamic quantization with symmetric quantized weights needs unsigned weights uint64_t dynQuantGroupSize = 0; @@ -297,8 +298,8 @@ static dnnl::inner_product_forward::primitive_desc createDescriptorInternal(cons } const dnnl::memory::desc weightsDesc = - useSparseWeights ? 
dnnl::memory::desc().sparse_desc(weightDesc.get_dims(), wdt) - : dnnl::memory::desc(weightDesc.get_dims(), wdt, memory::format_tag::any); + useSparseWeights ? dnnl::memory::desc().sparse_desc(normalizedWeightDesc.get_dims(), wdt) + : dnnl::memory::desc(normalizedWeightDesc.get_dims(), wdt, memory::format_tag::any); return dnnl::inner_product_forward::primitive_desc(engine, dnnl::prop_kind::forward_inference, @@ -387,8 +388,7 @@ DnnlShapeAgnosticDataPtr DnnlFCPrimitive::createShapeAgnosticData(const FCAttrs& useWeightsDecompression && useDynamicQuantizationImpl(attrs.dynamicQuantizationGroupSize, srcDesc, weiDesc, - attrs.decompressionMultiplyPtr, - attrs.decompressionSubtractPtr, + memory, !attrs.weightsNonTransposed); const auto postOpData = createPrimitiveAttrs(attrs, postOps, memory, context, useDynamicQuantization); diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp index 5295b9655066cc..21247f149ca69f 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp @@ -75,13 +75,6 @@ class DnnlFCPrimitive { const DnnlShapeAgnosticDataPtr& shapeAgnosticData); private: - static bool useDynamicQuantizationImpl(size_t dqGroupSize, - const MemoryDescPtr srcDesc, - const MemoryDescPtr weightsDesc, - MemoryCPtr scalesPtr, - MemoryCPtr zpPtr, - bool needTranspose); - dnnl::stream m_stream; dnnl::primitive_desc m_primDesc; impl_desc_type m_implType; diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp index 1b8646c858e532..86b22607111833 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp @@ -27,6 +27,7 @@ #include 
"nodes/executors/fullyconnected_config.hpp" #include "nodes/executors/matmul_config.hpp" #include "nodes/executors/memory_arguments.hpp" +#include "utils/cpu_utils.hpp" #include "utils/debug_capabilities.h" namespace ov { @@ -104,10 +105,10 @@ DnnlMemoryDescPtr DnnlMatMulPrimitive::makeTransposedWeightDescriptor(const Dnnl const auto& weiDesc = srcDesc->getDnnlDesc(); auto wDims = weiDesc.get_dims(); auto wDataType = weiDesc.get_data_type(); - std::swap(wDims[wDims.size() - 1], wDims[wDims.size() - 2]); + dnnl::memory::dims wDims2D = reshapeDownToRank<2>(wDims); const auto format = weightsNonTransposed ? dnnl::memory::format_tag::ab : dnnl::memory::format_tag::ba; - const auto transposedWeiDesc = dnnl::memory::desc{wDims, wDataType, format}; + const auto transposedWeiDesc = dnnl::memory::desc{wDims2D, wDataType, format}; return DnnlExtensionUtils::makeDescriptor(transposedWeiDesc); } @@ -128,15 +129,8 @@ static DnnlPrimitiveAttrs createPrimitiveAttrs(const MatMulAttrs& attrs, one_of(srcDesc->getPrecision(), ov::element::u8, ov::element::i8) && weiDesc->getPrecision() == ov::element::i8; auto outputDataType = DnnlExtensionUtils::ElementTypeToDataType(dstDesc->getPrecision()); - DnnlPostOpsComposer dnnlpoc(postOps, - context->getEngine(), - dims, - dims.size() - 1, - isINT8, - 1 << 0, - attrs.dequantizationScales, - !memory.at(ARG_BIAS)->getDesc().empty(), - outputDataType); + DnnlPostOpsComposer + dnnlpoc(postOps, context->getEngine(), dims, dims.size() - 1, isINT8, 1 << 0, memory, outputDataType); return dnnlpoc.compose(); } @@ -185,8 +179,7 @@ static dnnl::matmul::primitive_desc createDescriptorInternal(const dnnl::memory: wdt = memory::data_type::s8; } - const dnnl::memory::desc weightsDesc = - dnnl::memory::desc(weiDims, wdt, memory::format_tag::any); + const dnnl::memory::desc weightsDesc = dnnl::memory::desc(weiDims, wdt, memory::format_tag::any); return dnnl::matmul::primitive_desc(engine, inputsDesc, weightsDesc, newBiasDesc, outputsDesc, attr); } @@ -262,7 
+255,7 @@ DnnlShapeAgnosticDataPtr DnnlMatMulPrimitive::createShapeAgnosticData(const FCAt const auto& weiDesc = memory.at(ARG_WEI)->getDescPtr(); const auto& biasDesc = memory.at(ARG_BIAS)->getDescPtr(); auto dstDesc = memory.at(ARG_DST)->getDescPtr(); - MatMulAttrs mmAttrs{false, false, attrs.dequantizationScales}; + MatMulAttrs mmAttrs{false, false}; const auto postOpData = createPrimitiveAttrs(mmAttrs, postOps, memory, context, false); @@ -335,7 +328,8 @@ DnnlMatMulPrimitive::DnnlMatMulPrimitive(const Key& key, m_prim(primitive(m_primDesc)) {} void DnnlMatMulPrimitive::execute(const dnnl_primitive_args& primArgs) const { - std::cout << "Executing MM primitive" << "\n"; + std::cout << "Executing MM primitive" + << "\n"; m_prim.execute(m_stream, primArgs); } diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_shape_agnostic_data.hpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_shape_agnostic_data.hpp index d76e1984bd87d9..6a1b128be307ce 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_shape_agnostic_data.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_shape_agnostic_data.hpp @@ -12,8 +12,7 @@ namespace ov { namespace intel_cpu { struct DnnlShapeAgnosticData { - DnnlShapeAgnosticData(DnnlPrimitiveAttrs primAttrs) - : primAttrs(std::move(primAttrs)) {} + DnnlShapeAgnosticData(DnnlPrimitiveAttrs primAttrs) : primAttrs(std::move(primAttrs)) {} DnnlPrimitiveAttrs primAttrs; }; diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_utils.cpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_utils.cpp index fa273ac3d6c3ff..f23fd317d3546d 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_utils.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_utils.cpp @@ -8,8 +8,8 @@ #include #include "cpu_memory.h" -#include "memory_desc/dnnl_memory_desc.h" #include "memory_desc/cpu_memory_desc_utils.h" +#include "memory_desc/dnnl_memory_desc.h" #include "nodes/executors/executor.hpp" #include 
"nodes/reorder.h" #include "utils/cpu_utils.hpp" @@ -79,9 +79,9 @@ MemoryPtr prepareWeightsMemory(const DnnlMemoryDescPtr srcWeightDesc, auto globalWeightCache = context->getWeightsCache(); MemoryPtr ptr; - if (globalWeightCache && - dnnl::memory::format_kind::blocked == dstWeightDesc->getDnnlDesc().get_format_kind()) { - ptr = *globalWeightCache->findOrCreate(DnnlExtensionUtils::computeWeightsStringHash(weightsMem, dstWeightDesc), create); + if (globalWeightCache && dnnl::memory::format_kind::blocked == dstWeightDesc->getDnnlDesc().get_format_kind()) { + ptr = *globalWeightCache->findOrCreate(DnnlExtensionUtils::computeWeightsStringHash(weightsMem, dstWeightDesc), + create); } else { ptr = create(); } diff --git a/src/plugins/intel_cpu/src/nodes/executors/eltwise.cpp b/src/plugins/intel_cpu/src/nodes/executors/eltwise.cpp index 12bce382424e5c..8e7c470984b4f2 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/eltwise.cpp @@ -9,5 +9,5 @@ namespace intel_cpu { EltwiseExecutor::EltwiseExecutor(const ExecutorContext::CPtr context) : context(context) {} -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/eltwise.hpp b/src/plugins/intel_cpu/src/nodes/executors/eltwise.hpp index 4b1271c49d5df0..b33c0eca10dae7 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/eltwise.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/eltwise.hpp @@ -5,8 +5,8 @@ #pragma once #include "cpu_memory.h" -#include "onednn/iml_type_mapper.h" #include "executor.hpp" +#include "onednn/iml_type_mapper.h" namespace ov { namespace intel_cpu { @@ -19,10 +19,7 @@ struct EltwiseData { float gamma; bool operator==(const EltwiseData& rhs) const noexcept { - return algo == rhs.algo && - onednnAlgorithm == rhs.onednnAlgorithm && - alpha == rhs.alpha && - beta == rhs.beta && + return algo == 
rhs.algo && onednnAlgorithm == rhs.onednnAlgorithm && alpha == rhs.alpha && beta == rhs.beta && gamma == rhs.gamma; } }; @@ -34,24 +31,21 @@ struct EltwiseAttrs { float gamma; EltwiseAttrs() : algorithm(Algorithm::Default), alpha(0), beta(0), gamma(0) {} - EltwiseAttrs(Algorithm algorithm, float alpha, float beta, float gamma) : algorithm(algorithm), alpha(alpha), beta(beta), gamma(gamma) {} + EltwiseAttrs(Algorithm algorithm, float alpha, float beta, float gamma) + : algorithm(algorithm), + alpha(alpha), + beta(beta), + gamma(gamma) {} bool operator==(const EltwiseAttrs& rhs) const { bool retVal = true; - retVal = algorithm == rhs.algorithm && - alpha == rhs.alpha && - beta == rhs.beta && - gamma == rhs.gamma; + retVal = algorithm == rhs.algorithm && alpha == rhs.alpha && beta == rhs.beta && gamma == rhs.gamma; return retVal; } }; -enum class EltwisePostOpType { - Undefined, - Eltwise, - Dnnl -}; +enum class EltwisePostOpType { Undefined, Eltwise, Dnnl }; class EltwisePostOp { public: @@ -72,17 +66,20 @@ class EltwisePostOp { EltwisePostOpType type = EltwisePostOpType::Undefined; - bool operator==(const EltwisePostOp &rhs) const { - if (type != rhs.type) { return false; } + bool operator==(const EltwisePostOp& rhs) const { + if (type != rhs.type) { + return false; + } bool ret = true; switch (type) { - case EltwisePostOpType::Eltwise: - ret = eltwise == rhs.eltwise; - break; - case EltwisePostOpType::Dnnl: - ret = dnnlPostOps == rhs.dnnlPostOps; - break; - default: assert(!"unsupported eltwise post operation type"); + case EltwisePostOpType::Eltwise: + ret = eltwise == rhs.eltwise; + break; + case EltwisePostOpType::Dnnl: + ret = dnnlPostOps == rhs.dnnlPostOps; + break; + default: + assert(!"unsupported eltwise post operation type"); } return ret; } @@ -96,7 +93,9 @@ class EltwiseExecutor { const std::vector& dstDescs, const std::vector& postOps) = 0; - virtual void exec(const std::vector& src, const std::vector& dst, const void *post_ops_data_) = 0; + virtual 
void exec(const std::vector& src, + const std::vector& dst, + const void* post_ops_data_) = 0; virtual ~EltwiseExecutor() = default; virtual impl_desc_type getImplType() const = 0; @@ -121,5 +120,5 @@ class EltwiseExecutorBuilder { using EltwiseExecutorBuilderPtr = std::shared_ptr; using EltwiseExecutorBuilderCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/eltwise_list.cpp b/src/plugins/intel_cpu/src/nodes/executors/eltwise_list.cpp index 1bd6647310d387..5b9479bdf502b6 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/eltwise_list.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/eltwise_list.cpp @@ -10,11 +10,10 @@ namespace intel_cpu { const std::vector& getEltwiseExecutorsList() { static std::vector descs = { OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared()) - OV_CPU_INSTANCE_SHL(ExecutorType::Shl, std::make_shared()) - }; + OV_CPU_INSTANCE_SHL(ExecutorType::Shl, std::make_shared())}; return descs; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/eltwise_list.hpp b/src/plugins/intel_cpu/src/nodes/executors/eltwise_list.hpp index 618e3499dc10a7..ac5c27c0ad36dc 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/eltwise_list.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/eltwise_list.hpp @@ -4,19 +4,18 @@ #pragma once -#include "executor.hpp" - #include "eltwise.hpp" +#include "executor.hpp" #if defined(OV_CPU_WITH_ACL) -#include "aarch64/jit_eltwise.hpp" -#include "acl/acl_eltwise.hpp" +# include "aarch64/jit_eltwise.hpp" +# include "acl/acl_eltwise.hpp" #endif #if defined(OV_CPU_WITH_SHL) -#include "shl/shl_eltwise.hpp" +# include "shl/shl_eltwise.hpp" #endif -#include "onednn/iml_type_mapper.h" #include "common/primitive_cache.hpp" +#include "onednn/iml_type_mapper.h" 
namespace ov { namespace intel_cpu { @@ -31,9 +30,10 @@ const std::vector& getEltwiseExecutorsList(); class EltwiseExecutorFactory : public ExecutorFactoryLegacy { public: EltwiseExecutorFactory(const EltwiseAttrs& eltwiseAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs, - const ExecutorContext::CPtr context) : ExecutorFactoryLegacy(context) { + const std::vector& srcDescs, + const std::vector& dstDescs, + const ExecutorContext::CPtr context) + : ExecutorFactoryLegacy(context) { for (auto& desc : getEltwiseExecutorsList()) { if (desc.builder->isSupported(eltwiseAttrs, srcDescs, dstDescs)) { supportedDescs.push_back(desc); @@ -43,9 +43,9 @@ class EltwiseExecutorFactory : public ExecutorFactoryLegacy { ~EltwiseExecutorFactory() = default; virtual EltwiseExecutorPtr makeExecutor(const EltwiseAttrs& eltwiseAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs, - const std::vector& postOps) { + const std::vector& srcDescs, + const std::vector& dstDescs, + const std::vector& postOps) { auto build = [&](const EltwiseExecutorDesc* desc) { auto executor = desc->builder->makeExecutor(context); if (executor->init(eltwiseAttrs, srcDescs, dstDescs, postOps)) { @@ -84,5 +84,5 @@ class EltwiseExecutorFactory : public ExecutorFactoryLegacy { using EltwiseExecutorFactoryPtr = std::shared_ptr; using EltwiseExecutorFactoryCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/executor.cpp b/src/plugins/intel_cpu/src/nodes/executors/executor.cpp index 236f51c6d16149..399dab3d5499b9 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/executor.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/executor.cpp @@ -2,15 +2,17 @@ // SPDX-License-Identifier: Apache-2.0 // -#include - #include "executor.hpp" +#include + namespace ov { namespace intel_cpu { std::string ExecutorTypeToString(const ExecutorType type) { -#define CASE(_type) case 
ExecutorType::_type: return #_type; +#define CASE(_type) \ + case ExecutorType::_type: \ + return #_type; switch (type) { CASE(Undefined); CASE(Graph); @@ -27,7 +29,10 @@ std::string ExecutorTypeToString(const ExecutorType type) { } ExecutorType ExecutorTypeFromString(const std::string& typeStr) { -#define CASE(_type) if (typeStr == #_type) { return ExecutorType::_type; } +#define CASE(_type) \ + if (typeStr == #_type) { \ + return ExecutorType::_type; \ + } CASE(Undefined); CASE(Graph); CASE(Common); @@ -41,5 +46,5 @@ ExecutorType ExecutorTypeFromString(const std::string& typeStr) { return ExecutorType::Undefined; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/executor.hpp b/src/plugins/intel_cpu/src/nodes/executors/executor.hpp index 5b9df5a6e77a55..16a419c95d5efc 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/executor.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/executor.hpp @@ -4,15 +4,15 @@ #pragma once -#include "openvino/core/except.hpp" -#include "openvino/core/visibility.hpp" #include #include "cache/multi_cache.h" #include "cpu_memory.h" #include "graph_context.h" -#include "onednn/iml_type_mapper.h" #include "memory_arguments.hpp" +#include "onednn/iml_type_mapper.h" +#include "openvino/core/except.hpp" +#include "openvino/core/visibility.hpp" namespace ov { namespace intel_cpu { @@ -24,25 +24,25 @@ namespace intel_cpu { #endif #if defined(OV_CPU_WITH_ACL) -# if defined(OPENVINO_ARCH_ARM) -# define OV_CPU_INSTANCE_ACL32(...) {__VA_ARGS__}, -# else -# define OV_CPU_INSTANCE_ACL32(...) -# endif -# if defined(OPENVINO_ARCH_ARM64) -# define OV_CPU_INSTANCE_ACL64(...) {__VA_ARGS__}, -# else -# define OV_CPU_INSTANCE_ACL64(...) -# endif -# if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64) -# define OV_CPU_INSTANCE_ACL(...) {__VA_ARGS__}, -# else -# define OV_CPU_INSTANCE_ACL(...) 
-# endif +# if defined(OPENVINO_ARCH_ARM) +# define OV_CPU_INSTANCE_ACL32(...) {__VA_ARGS__}, +# else +# define OV_CPU_INSTANCE_ACL32(...) +# endif +# if defined(OPENVINO_ARCH_ARM64) +# define OV_CPU_INSTANCE_ACL64(...) {__VA_ARGS__}, +# else +# define OV_CPU_INSTANCE_ACL64(...) +# endif +# if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64) +# define OV_CPU_INSTANCE_ACL(...) {__VA_ARGS__}, +# else +# define OV_CPU_INSTANCE_ACL(...) +# endif #else -# define OV_CPU_INSTANCE_ACL32(...) -# define OV_CPU_INSTANCE_ACL64(...) -# define OV_CPU_INSTANCE_ACL(...) +# define OV_CPU_INSTANCE_ACL32(...) +# define OV_CPU_INSTANCE_ACL64(...) +# define OV_CPU_INSTANCE_ACL(...) #endif #if defined(OV_CPU_WITH_DNNL) @@ -72,28 +72,11 @@ namespace intel_cpu { #define OV_CPU_INSTANCE_COMMON(...) {__VA_ARGS__}, // @todo another option is to determine shape relation by executor type -enum class ShapeTolerance { - Agnostic, - Dependant -}; +enum class ShapeTolerance { Agnostic, Dependant }; -enum class ExecutorType { - Undefined, - Graph, - Common, - jit_x64, - Dnnl, - Acl, - Mlas, - jit_aarch64, - Shl -}; +enum class ExecutorType { Undefined, Graph, Common, jit_x64, Dnnl, Acl, Mlas, jit_aarch64, Shl }; -enum class OperationType { - FullyConnected, - MatMul, - Convolution -}; +enum class OperationType { FullyConnected, MatMul, Convolution }; std::string ExecutorTypeToString(const ExecutorType type); ExecutorType ExecutorTypeFromString(const std::string& typeStr); @@ -112,8 +95,10 @@ class ExecutorContext { engine(graphContext->getEngine()), implPriorities(implPriorities), privateWeighCache(std::move(privateWeighCache)), - numNumaNodes(graphContext->getNumNumaNodes()) - {} + numNumaNodes(graphContext->getNumNumaNodes()) { + auto cpuStreamsExecutor = graphContext->getCPUStreamExecutor(); + curNumaNodeId = std::max(0, cpuStreamsExecutor ? 
cpuStreamsExecutor->get_numa_node_id() : curNumaNodeId); + } MultiCachePtr getRuntimeCache() const { auto runtimeCachePtr = runtimeCache.lock(); @@ -121,12 +106,8 @@ class ExecutorContext { return runtimeCachePtr; } - DnnlScratchPadPtr getScratchPad(int subStreamID = 0) const { - if (subStreamID < 0) - subStreamID = 0; - if (subStreamID >= numNumaNodes - 1) - subStreamID = numNumaNodes - 1; - return scratchPads[subStreamID]; + DnnlScratchPadPtr getScratchPad() const { + return scratchPads[curNumaNodeId]; } std::shared_ptr> getPrivateWeighCache() const { @@ -156,6 +137,7 @@ class ExecutorContext { // @todo remove after global cache is used exclusevly std::shared_ptr> privateWeighCache; int numNumaNodes; + int curNumaNodeId = -1; }; class ExecutorFactoryLegacy { diff --git a/src/plugins/intel_cpu/src/nodes/executors/executor_config.hpp b/src/plugins/intel_cpu/src/nodes/executors/executor_config.hpp index 09b3b33cfe6b2f..cd9bcaf7a119f7 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/executor_config.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/executor_config.hpp @@ -4,9 +4,8 @@ #pragma once -#include "post_ops.hpp" #include "memory_arguments.hpp" -#include "printers.hpp" +#include "post_ops.hpp" namespace ov { namespace intel_cpu { diff --git a/src/plugins/intel_cpu/src/nodes/executors/executor_factory.hpp b/src/plugins/intel_cpu/src/nodes/executors/executor_factory.hpp index 419ab4abf52cd7..dd05cc58d43c32 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/executor_factory.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/executor_factory.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2022 Intel Corporation +// Copyright (C) 2018-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -6,50 +6,21 @@ #include #include -#include #include "executor.hpp" -#include "nodes/executors/implementations.hpp" #include "nodes/executors/executor_config.hpp" #include "nodes/executors/executor_implementation.hpp" #include 
"nodes/executors/graph_emitter.hpp" +#include "nodes/executors/implementations.hpp" #include "nodes/executors/memory_arguments.hpp" #include "nodes/executors/printers.hpp" -#include "openvino/core/except.hpp" +#include "nodes/executors/variable_executor.hpp" #include "post_ops.hpp" namespace ov { namespace intel_cpu { -using namespace executor; - -template -static ExecutorPtr fallback(const executor::Config& config, - const executor::Config& fallbackConfig, - const MemoryArgs& memory, - const ExecutorContext::CPtr context, - const std::string& name) { - DEBUG_LOG("Falling back to graph executor for ", - name, - ". Original config: ", - config, - " new config:", - fallbackConfig); - - GraphEmitter graphEmitter(config.descs, config.attrs, config.postOps, memory, context, name); - - const auto& graphExecutor = - graphEmitter.createGraph(fallbackConfig.descs, fallbackConfig.attrs, fallbackConfig.postOps, context) - .ensureAttrsMatch() - .ensureSrcDescsMatch() - .ensureDstDescsMatch() - .ensurePostOpsMatch() - .emit(); - (void)graphExecutor; - OPENVINO_THROW("Fallback logic is not implemented yet"); // return graphExecutor; -} - -template +template class ExecutorFactory { public: using ExecutorImplementationRef = std::reference_wrapper>; @@ -62,9 +33,7 @@ class ExecutorFactory { : m_attrs(attrs), m_postOps(postOps), m_context(context), - m_suitableImplementations(filter(m_attrs, m_postOps, descriptors, implementationPriority)), - m_implementationRequiresFallback(m_suitableImplementations.size(), true), - m_executors(m_suitableImplementations.size()) {} + m_suitableImplementations(filter(m_attrs, m_postOps, descriptors, implementationPriority)) {} /** * @brief Retrieves the proper memory descriptors based on the provided memory descriptors. @@ -95,104 +64,42 @@ class ExecutorFactory { } /** - * @brief Preconfigures an executor based on the provided memory arguments. 
- * - * Preconfigures an executor by selecting an appropriate implementation based on the provided - * memory arguments and by creating an executor using the implementation. - * - * @param memory The memory parameters used for selecting the appropriate executor implementation. - * - * @note The main use case is to offload executor data preparation (i.e. weights packing) - * From the make() call - * @todo Currently supports creating a single executor. - * For some nodes it can be worth to preconfigure all the executors. - */ - void preconfigure(const MemoryArgs& memory) { - executor::Config config{memoryDescsFromMemory(memory), m_attrs, m_postOps}; - - cacheFallbackStatus(config); - - const size_t implId = select(memory, 0); - const auto& impl = m_suitableImplementations[implId].get(); - DEBUG_LOG("Preconfiguring executor: ", impl.name()); - - if (m_implementationRequiresFallback[implId]) { - if (auto fallbackConfig = impl.requiresFallback(config)) { - fallback(config, *fallbackConfig, memory, m_context, impl.name()); - } - } - - (void)create(implId, memory, m_context); - } - - /** - * @brief Creates an Executor instance based on provided memory arguments. + * @brief Creates an Executor instance based on the provided memory arguments. * - * Creates an Executor instance using the provided MemoryArgs, selecting an appropriate implementation - * based on the characteristics of the memory. It handles fallback scenarios if necessary and updates the executor - * with the given memory information. + * Depending on the number of available implementations, returns: + * - VariableExecutor, if the number of implementations is two or more + * - Simple Executor, if there is only one available implementation * * @param memory memory arguments. * * @return A shared pointer to the created Executor. - * - * The function follows the steps below: - * - Selects an implementation based on the provided memory using the select() function. 
- * - Retrieves the selected implementation and checks if fallback is required. - * - If fallback is required, it creates a fallback configuration and returns a fallback executor. - * - Otherwise creates the executor using the selected implementation. - * - Updates the executor with the given memory information. - * */ - ExecutorPtr make(MemoryArgs& memory) { - auto createExec = [this](MemoryArgs& memory, size_t implId) -> ExecutorPtr { - const auto& impl = m_suitableImplementations[implId].get(); - if (m_implementationRequiresFallback[implId]) { - executor::Config config{memoryDescsFromMemory(memory), m_attrs, m_postOps}; - if (auto fallbackConfig = impl.requiresFallback(config)) { - return fallback(config, *fallbackConfig, memory, m_context, impl.name()); - } - } - const auto executor = create(implId, memory, m_context); - if (!executor->update(memory)) { - return nullptr; + ExecutorPtr make(const MemoryArgs& memory) { + // only single executor is available + if (m_suitableImplementations.size() == 1) { + auto config = GraphEmitter::createConfig(memory, m_attrs, m_postOps); + + const auto& theOnlyImplementation = m_suitableImplementations.front().get(); + + if (const auto fallbackConfig = theOnlyImplementation.requiresFallback(config)) { + return GraphEmitter::fallback(config, + *fallbackConfig, + memory, + m_context, + theOnlyImplementation.name()); } - return executor; - }; - - auto implId = select(memory, 0); - auto executor = createExec(memory, implId); - while (!executor) { - implId = select(memory, ++implId); - executor = createExec(memory, implId); - } - return executor; - } -private: - static MemoryDescArgs memoryDescsFromMemory(const MemoryArgs& memory) { - MemoryDescArgs memoryDescs; - memoryDescs.reserve(memory.size()); - - for (const auto& mem : memory) { - memoryDescs[mem.first] = mem.second->getDescPtr(); + return theOnlyImplementation.create(m_attrs, m_postOps, memory, m_context); } - return memoryDescs; - } - - /** - * @brief Caches the fallback 
status for each suitable implementation. - */ - void cacheFallbackStatus(const executor::Config& config) { - std::transform(m_suitableImplementations.begin(), - m_suitableImplementations.end(), - m_implementationRequiresFallback.begin(), - [&config](const ExecutorImplementationRef& impl) { - return impl.get().requiresFallback(config); - }); + return std::make_shared>(memory, + m_attrs, + m_postOps, + m_context, + m_suitableImplementations); } +private: /** * @brief Filters and retrieves suitable implementations based on the provided executor configuration. * @@ -205,11 +112,10 @@ class ExecutorFactory { * @note If an implementation is shape agnostic, no further implementations with lower * priority are considered. */ - static std::vector filter( - const Attrs& attrs, - const PostOps& postOps, - const MemoryDescArgs& descs, - const std::string& implementationPriority = {}) { + static std::vector filter(const Attrs& attrs, + const PostOps& postOps, + const MemoryDescArgs& descs, + const std::string& implementationPriority = {}) { const auto& implementations = getImplementations(); std::vector suitableImplementations; const executor::Config config{descs, attrs, postOps}; @@ -244,51 +150,17 @@ class ExecutorFactory { return suitableImplementations; } - size_t select(const MemoryArgs& memory, const size_t startIdx) const { - OPENVINO_ASSERT(startIdx < m_suitableImplementations.size(), - "Failed to find an implementation since start indx: ", startIdx, - " is out of range of the suitable implementations array: ", m_suitableImplementations.size()); - auto startIt = m_suitableImplementations.begin(); - std::advance(startIt, startIdx); - const auto selectedImplementation = - std::find_if(startIt, - m_suitableImplementations.end(), - [&memory](const ExecutorImplementationRef& implementation) { - return implementation.get().shapeAgnostic() || implementation.get().acceptsShapes(memory); - }); - OPENVINO_ASSERT(selectedImplementation != m_suitableImplementations.end(), "Failed 
to select an implemetation"); - - return std::distance(m_suitableImplementations.begin(), selectedImplementation); - } - - ExecutorPtr create(const size_t implId, - const MemoryArgs& memory, - const ExecutorContext::CPtr context) { - assert(implId < m_executors.size() && implId < m_suitableImplementations.size()); - - if (!m_executors[implId]) { - const auto& impl = m_suitableImplementations[implId].get(); - m_executors[implId] = impl.create(m_attrs, m_postOps, memory, context); - } - - return m_executors[implId]; - } - const Attrs& m_attrs; const PostOps& m_postOps; const ExecutorContext::CPtr m_context; std::vector m_suitableImplementations; - // stores fallback status to avoid performing the check for every make() call - std::vector m_implementationRequiresFallback; - // executors cache - std::vector m_executors; }; -template -using ExecutorFactoryPtr = std::shared_ptr>; +template +using ExecutorFactoryPtr = std::shared_ptr>; -template -using ExecutorFactoryCPtr = std::shared_ptr>; +template +using ExecutorFactoryCPtr = std::shared_ptr>; } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/executor_implementation.hpp b/src/plugins/intel_cpu/src/nodes/executors/executor_implementation.hpp index 3459d1fe35e19e..07a58b0fa6cfa7 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/executor_implementation.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/executor_implementation.hpp @@ -19,22 +19,22 @@ template class ExecutorImplementation { public: using SupportsPredicate = std::function&)>; - using RequiresFallbackPredicate = std::function>(const executor::Config&)>; + using RequiresFallbackPredicate = + std::function>(const executor::Config&)>; using AcceptsShapePredicate = std::function; using CreateFunction = std::function; - ExecutorImplementation( - const char* name, - const ExecutorType type, - const OperationType operationType, - const ShapeTolerance shapeRelation, - SupportsPredicate supports, - 
RequiresFallbackPredicate requiresFallback, - AcceptsShapePredicate acceptsShape, - CreateFunction create) + ExecutorImplementation(const char* name, + const ExecutorType type, + const OperationType operationType, + const ShapeTolerance shapeRelation, + SupportsPredicate supports, + RequiresFallbackPredicate requiresFallback, + AcceptsShapePredicate acceptsShape, + CreateFunction create) : m_name(name), m_type(type), m_operationType(operationType), diff --git a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_config.hpp b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_config.hpp index ad6479597c6971..1699a845a3314b 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_config.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_config.hpp @@ -19,13 +19,8 @@ struct FCAttrs { bool withBias = false; bool weightsNonTransposed = false; bool sparseWeights = false; - // @todo only memory descriptors should be a part of attributes - // actual memory should be passed into "execute" or "prepareMemory" calls - std::vector dequantizationScales; - // @todo should be passed as an additional memory input? 
- MemoryCPtr decompressionSubtractPtr; - MemoryCPtr decompressionMultiplyPtr; uint64_t dynamicQuantizationGroupSize; + ov::intel_cpu::Config::ModelType modelType = ov::intel_cpu::Config::ModelType::Unknown; }; diff --git a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp index 5834c3dda4b262..42101ce3fca257 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp @@ -12,12 +12,13 @@ #include "nodes/executors/convolution_config.hpp" #include "nodes/executors/dnnl/dnnl_convolution_primitive.hpp" #include "nodes/executors/dnnl/dnnl_fullyconnected.hpp" +#include "nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp" #include "nodes/executors/dnnl/dnnl_matmul_primitive.hpp" #include "nodes/executors/dnnl/dnnl_shape_agnostic_data.hpp" #include "nodes/executors/executor.hpp" #include "nodes/executors/executor_implementation.hpp" -#include "nodes/executors/implementations.hpp" #include "nodes/executors/fullyconnected_config.hpp" +#include "nodes/executors/implementations.hpp" #include "nodes/executors/memory_arguments.hpp" #include "nodes/executors/mlas/mlas_gemm.hpp" #include "nodes/executors/precision_matcher.hpp" @@ -29,7 +30,7 @@ #include "utils/debug_capabilities.h" #if defined(OV_CPU_WITH_ACL) -#include "nodes/executors/acl/acl_fullyconnected.hpp" +# include "nodes/executors/acl/acl_fullyconnected.hpp" #endif #if defined(OV_CPU_WITH_SHL) @@ -49,7 +50,7 @@ using LayoutConfig = std::vector; static const LayoutConfig dnnlFCLayoutConfig{LayoutType::ncsp, LayoutType::ncsp, LayoutType::ncsp, LayoutType::ncsp}; static const LayoutConfig aclFCLayoutConfig{LayoutType::ncsp, LayoutType::ncsp, LayoutType::ncsp, LayoutType::ncsp}; -template +template struct Require { bool operator()() { return dnnl::impl::cpu::x64::mayiuse(ISA); @@ -143,10 +144,10 @@ static 
bool fullyMatchConfiguration(const MemoryDescArgs& currentDescriptors, continue; if (desc->getPrecision() != type) - return false; // type mismatch + return false; // type mismatch if (!desc->hasLayoutType(layoutConfig[i])) - return false; // layout mismatch + return false; // layout mismatch } return true; @@ -206,6 +207,8 @@ OV_CPU_MAYBE_UNUSED_FUNCTION static inline bool noPostOps(const FCConfig& config return config.postOps.empty(); } +// to keep OV_CPU_INSTANCE macros aligned +// clang-format off template <> const std::vector>& getImplementations() { static const std::vector> fullyconnectedImplementations { @@ -440,8 +443,7 @@ const std::vector>& getImplementations() { const ExecutorContext::CPtr context, std::shared_ptr shareAgnosticData) const { MatMulAttrs matMulAttrs{false, - false, - attrs.dequantizationScales}; + false}; auto primitive = DefaultInstantiator{}( memory, @@ -492,5 +494,7 @@ const std::vector>& getImplementations() { return fullyconnectedImplementations; } +// clang-format on + } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/graph_emitter.hpp b/src/plugins/intel_cpu/src/nodes/executors/graph_emitter.hpp index 6aad18c793c8cf..347ac4c981f4f1 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/graph_emitter.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/graph_emitter.hpp @@ -5,12 +5,11 @@ #pragma once #include -#include #include "graph.h" -#include "memory_desc/cpu_memory_desc.h" #include "node.h" #include "nodes/executors/executor.hpp" +#include "nodes/executors/executor_config.hpp" #include "post_ops.hpp" namespace ov { @@ -72,6 +71,47 @@ class GraphEmitter { return graph; } + static MemoryDescArgs memoryDescsFromMemory(const MemoryArgs& memory) { + MemoryDescArgs memoryDescs; + memoryDescs.reserve(memory.size()); + + for (const auto& mem : memory) { + memoryDescs[mem.first] = mem.second->getDescPtr(); + } + + return memoryDescs; + } + + static executor::Config createConfig(const 
MemoryArgs& memory, const Attrs& attrs, const PostOps& postOps) { + return executor::Config{memoryDescsFromMemory(memory), attrs, postOps}; + } + + static ExecutorPtr fallback(const executor::Config& config, + const executor::Config& fallbackConfig, + const MemoryArgs& memory, + const ExecutorContext::CPtr context, + const std::string& name) { + DEBUG_LOG("Falling back to graph executor for ", + name, + ". Original config: ", + config, + " new config:", + fallbackConfig); + + GraphEmitter graphEmitter(config.descs, config.attrs, config.postOps, memory, context, name); + + const auto& graphExecutor = + graphEmitter.createGraph(fallbackConfig.descs, fallbackConfig.attrs, fallbackConfig.postOps, context) + .ensureAttrsMatch() + .ensureSrcDescsMatch() + .ensureDstDescsMatch() + .ensurePostOpsMatch() + .emit(); + (void)graphExecutor; + + OPENVINO_THROW("Fallback logic is not implemented yet"); // return graphExecutor; + } + private: const MemoryDescArgs& descs; const Attrs& attrs; diff --git a/src/plugins/intel_cpu/src/nodes/executors/implementation_utils.hpp b/src/plugins/intel_cpu/src/nodes/executors/implementation_utils.hpp index cd029283a09c50..bee82af305c9d2 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/implementation_utils.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/implementation_utils.hpp @@ -5,6 +5,7 @@ #pragma once #include + #include "cpu_types.h" #include "memory_desc/cpu_memory_desc.h" #include "nodes/executors/memory_arguments.hpp" @@ -13,80 +14,80 @@ namespace ov { namespace intel_cpu { -template +template ov::element::Type memoryDescType(const Config& config) { return config.descs.at(idx)->getPrecision(); } -template +template ov::element::Type srcType(const Config& config) { return memoryDescType(config); } -template +template ov::element::Type weiType(const Config& config) { return memoryDescType(config); } -template +template ov::element::Type biaType(const Config& config) { return memoryDescType(config); } -template +template 
ov::element::Type dstType(const Config& config) { return memoryDescType(config); } -template +template ov::element::Type dims(const Config& config) { return config.descs.at(idx)->getShape().getDims(); } -template +template const VectorDims& srcDims(const Config& config) { return dims(config); } -template +template const VectorDims& weiDims(const Config& config) { return dims(config); } -template +template size_t rank(const Config& config) { return config.descs.at(idx)->getShape().getRank(); } -template +template size_t srcRank(const Config& config) { return rank(config); } -template +template size_t weiRank(const Config& config) { return rank(config); } -template +template size_t memSize(const Config& config) { return config.descs.at(idx)->getCurrentMemSize(); } -template +template size_t srcMemSize(const Config& config) { return memSize(config); } -template +template size_t weiMemSize(const Config& config) { return memSize(config); } -template +template size_t postOpsNumbers(const Config& config) { return config.postOps.size(); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/interpolate.cpp b/src/plugins/intel_cpu/src/nodes/executors/interpolate.cpp index d0a006b1bea0fa..cb830a36f03cb1 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/interpolate.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/interpolate.cpp @@ -3,18 +3,19 @@ // #include "interpolate.hpp" -#include "openvino/core/parallel.hpp" -#include "nodes/common/cpu_memcpy.h" + #include "emitters/plugin/x64/jit_load_store_emitters.hpp" +#include "nodes/common/cpu_memcpy.h" +#include "openvino/core/parallel.hpp" using namespace ov::intel_cpu; bool ov::intel_cpu::InterpolateExecutor::init(const InterpolateAttrs& interpolateAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs, - const dnnl::primitive_attr &attr) { - const auto &srcDims = srcDescs[0]->getShape().getStaticDims(); - const auto 
&dstDims = dstDescs[0]->getShape().getStaticDims(); + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr) { + const auto& srcDims = srcDescs[0]->getShape().getStaticDims(); + const auto& dstDims = dstDescs[0]->getShape().getStaticDims(); interpAttrs = interpolateAttrs; srcDimPad5d = to5Dim(getPaddedInputShape(srcDims, interpolateAttrs.padBegin, interpolateAttrs.padEnd)); dstDim5d = to5Dim(dstDims); @@ -24,38 +25,49 @@ bool ov::intel_cpu::InterpolateExecutor::init(const InterpolateAttrs& interpolat spatialDimSize = getSpatialDimsNum(dataRank); switch (interpAttrs.mode) { - case InterpolateMode::nearest: { - buildTblNN(srcDimPad5d, dstDim5d, interpAttrs.dataScales, interpolateAttrs.layout, interpolateAttrs.nearestMode); - break; - } - case InterpolateMode::linear_onnx: { - buildTblLinearOnnx(srcDimPad5d, dstDim5d, interpAttrs.dataScales, interpolateAttrs.layout); - break; - } - case InterpolateMode::linear: { - static constexpr int LINEAR_KERNEL = 2; - buildTblLinear(srcDimPad5d, dstDim5d, interpAttrs.dataScales, LINEAR_KERNEL, interpolateAttrs.antialias); - break; - } - case InterpolateMode::cubic: { - buildTblCubic(srcDimPad5d, dstDim5d, interpAttrs.dataScales, interpolateAttrs.cubeCoeff, interpolateAttrs.layout); - break; - } - default: { - OPENVINO_THROW("Interpolate executor does not support interpolate mode: ", interpAttrs.mode); - break; - } + case InterpolateMode::nearest: { + buildTblNN(srcDimPad5d, + dstDim5d, + interpAttrs.dataScales, + interpolateAttrs.layout, + interpolateAttrs.nearestMode); + break; + } + case InterpolateMode::linear_onnx: { + buildTblLinearOnnx(srcDimPad5d, dstDim5d, interpAttrs.dataScales, interpolateAttrs.layout); + break; + } + case InterpolateMode::linear: { + static constexpr int LINEAR_KERNEL = 2; + buildTblLinear(srcDimPad5d, dstDim5d, interpAttrs.dataScales, LINEAR_KERNEL, interpolateAttrs.antialias); + break; + } + case InterpolateMode::cubic: { + buildTblCubic(srcDimPad5d, + 
dstDim5d, + interpAttrs.dataScales, + interpolateAttrs.cubeCoeff, + interpolateAttrs.layout); + break; + } + default: { + OPENVINO_THROW("Interpolate executor does not support interpolate mode: ", interpAttrs.mode); + break; + } } return true; } // ===================================================================================================================== // index layout: // d_0............d_OD-1, h_0..............h_OH-1, w_0................w_OW-1 -void ov::intel_cpu::InterpolateExecutor::buildTblNN(const VectorDims& srcDimPad5d, const VectorDims& dstDim5d, - const std::vector& dataScales, InterpolateLayoutType layout, InterpolateNearestMode nearestMode) { +void ov::intel_cpu::InterpolateExecutor::buildTblNN(const VectorDims& srcDimPad5d, + const VectorDims& dstDim5d, + const std::vector& dataScales, + InterpolateLayoutType layout, + InterpolateNearestMode nearestMode) { const int dimSize = dataRank; float fz = (dimSize == 5) ? dataScales[dimSize - 3] : 1.f; - float fy = dataScales[dimSize - 2]; + float fy = dataScales[dimSize - 2]; float fx = dataScales[dimSize - 1]; size_t ID = srcDimPad5d[2], IH = srcDimPad5d[3], IW = srcDimPad5d[4]; size_t OD = dstDim5d[2], OH = dstDim5d[3], OW = dstDim5d[4]; @@ -84,80 +96,91 @@ void ov::intel_cpu::InterpolateExecutor::buildTblNN(const VectorDims& srcDimPad5 // scale is float(outShape) / float(inShape) // strictly consistent with onnx calc manner(div scale, not multiply inverse), given this is done offline // the slight precison diff can produce obvious wrong value due to "nearest round" behavior for NN mode -float ov::intel_cpu::InterpolateExecutor::coordTransToInput(int outCoord, float scale, int inShape, int outShape) const { +float ov::intel_cpu::InterpolateExecutor::coordTransToInput(int outCoord, + float scale, + int inShape, + int outShape) const { if (scale == 1.0f || (inShape == outShape)) { return outCoord; } switch (interpAttrs.coordTransMode) { - case InterpolateCoordTransMode::half_pixel: { + case 
InterpolateCoordTransMode::half_pixel: { + return (outCoord + 0.5f) / scale - 0.5f; + break; + } + case InterpolateCoordTransMode::pytorch_half_pixel: { + if (outShape > 1) return (outCoord + 0.5f) / scale - 0.5f; - break; - } - case InterpolateCoordTransMode::pytorch_half_pixel: { - if (outShape > 1) - return (outCoord + 0.5f) / scale - 0.5f; - else - return 0; - break; - } - case InterpolateCoordTransMode::asymmetric: { - return static_cast(outCoord) / scale; - break; - } - case InterpolateCoordTransMode::tf_half_pixel_for_nn: { - return (outCoord + 0.5f) / scale; - break; - } - case InterpolateCoordTransMode::align_corners: { - if (outShape > 1) - return outCoord * (static_cast(inShape - 1) / static_cast(outShape - 1)); - else - return 0; - break; - } - default: { - OPENVINO_THROW("errorPrefix", " does not support specified coordinate transformation mode"); - break; - } + else + return 0; + break; + } + case InterpolateCoordTransMode::asymmetric: { + return static_cast(outCoord) / scale; + break; + } + case InterpolateCoordTransMode::tf_half_pixel_for_nn: { + return (outCoord + 0.5f) / scale; + break; + } + case InterpolateCoordTransMode::align_corners: { + if (outShape > 1) + return outCoord * (static_cast(inShape - 1) / static_cast(outShape - 1)); + else + return 0; + break; + } + default: { + OPENVINO_THROW("errorPrefix", " does not support specified coordinate transformation mode"); + break; + } } } -int ov::intel_cpu::InterpolateExecutor::nearestRound(float originCoord, bool isDownsample, InterpolateNearestMode nearestMode) const { +int ov::intel_cpu::InterpolateExecutor::nearestRound(float originCoord, + bool isDownsample, + InterpolateNearestMode nearestMode) const { switch (nearestMode) { - case InterpolateNearestMode::round_prefer_floor: { - if (originCoord == (static_cast(originCoord) + 0.5f)) - return static_cast(std::floor(originCoord)); - else - return static_cast(std::round(originCoord)); - break; - } - case 
InterpolateNearestMode::round_prefer_ceil: { - return static_cast(std::round(originCoord)); - break; - } - case InterpolateNearestMode::floor: { + case InterpolateNearestMode::round_prefer_floor: { + if (originCoord == (static_cast(originCoord) + 0.5f)) return static_cast(std::floor(originCoord)); - break; - } - case InterpolateNearestMode::ceil: { + else + return static_cast(std::round(originCoord)); + break; + } + case InterpolateNearestMode::round_prefer_ceil: { + return static_cast(std::round(originCoord)); + break; + } + case InterpolateNearestMode::floor: { + return static_cast(std::floor(originCoord)); + break; + } + case InterpolateNearestMode::ceil: { + return static_cast(std::ceil(originCoord)); + break; + } + case InterpolateNearestMode::simple: { + if (isDownsample) return static_cast(std::ceil(originCoord)); - break; - } - case InterpolateNearestMode::simple: { - if (isDownsample) - return static_cast(std::ceil(originCoord)); - else - return static_cast(originCoord); - } - default: { - OPENVINO_THROW("errorPrefix", " does not support specified nearest round mode"); - break; - } + else + return static_cast(originCoord); + } + default: { + OPENVINO_THROW("errorPrefix", " does not support specified nearest round mode"); + break; + } } } -void ov::intel_cpu::InterpolateExecutor::linearOnnxCF(int outCoord, float scale, int inShape, int outShape, - int& index0, int& index1, float& weight0, float& weight1) { +void ov::intel_cpu::InterpolateExecutor::linearOnnxCF(int outCoord, + float scale, + int inShape, + int outShape, + int& index0, + int& index1, + float& weight0, + float& weight1) { float inCoord = coordTransToInput(outCoord, scale, inShape, outShape); inCoord = std::max(0.0f, std::min(inCoord, static_cast(inShape - 1))); index0 = std::min(static_cast(inCoord), inShape - 1); @@ -171,8 +194,10 @@ void ov::intel_cpu::InterpolateExecutor::linearOnnxCF(int outCoord, float scale, } } -void ov::intel_cpu::InterpolateExecutor::buildTblLinearOnnx(const 
VectorDims& srcDimPad5d, const VectorDims& dstDim5d, - const std::vector& dataScales, InterpolateLayoutType layout) { +void ov::intel_cpu::InterpolateExecutor::buildTblLinearOnnx(const VectorDims& srcDimPad5d, + const VectorDims& dstDim5d, + const std::vector& dataScales, + InterpolateLayoutType layout) { int dimSize = dataRank; float fz = (spatialDimSize > 2) ? dataScales[dimSize - 3] : 1.f; float fy = (spatialDimSize > 1) ? dataScales[dimSize - 2] : 1.f; @@ -231,7 +256,7 @@ void ov::intel_cpu::InterpolateExecutor::buildTblLinearOnnx(const VectorDims& sr indexPtr[1][idxOzOyOx] = (izF * IH * IW + iyT * IW + ixR) * scale; weightPtr[0][idxOzOyOx] = weightL; weightPtr[1][idxOzOyOx] = weightR; - if (spatialDimSize > 1) { + if (spatialDimSize > 1) { indexPtr[2][idxOzOyOx] = (izF * IH * IW + iyB * IW + ixL) * scale; indexPtr[3][idxOzOyOx] = (izF * IH * IW + iyB * IW + ixR) * scale; weightPtr[2][idxOzOyOx] = weightT; @@ -284,8 +309,11 @@ void ov::intel_cpu::InterpolateExecutor::buildTblLinearOnnx(const VectorDims& sr // wd .........wd, wh............wh, ww.............ww, id...........id, ih............ih, iw..............iw // | | // wh0.....wh_diameter ih0.....ih_diameter -void ov::intel_cpu::InterpolateExecutor::buildTblLinear(const VectorDims& srcDimPad5d, const VectorDims& dstDim5d, - const std::vector& dataScales, int kernel_width, bool antialias) { +void ov::intel_cpu::InterpolateExecutor::buildTblLinear(const VectorDims& srcDimPad5d, + const VectorDims& dstDim5d, + const std::vector& dataScales, + int kernel_width, + bool antialias) { int dimSize = dataRank; float fz = (dimSize == 5) ? 
dataScales[dimSize - 3] : 1.f; float fy = dataScales[dimSize - 2]; @@ -309,15 +337,15 @@ void ov::intel_cpu::InterpolateExecutor::buildTblLinear(const VectorDims& srcDim int sizeOH = OH * diaOH; int sizeOW = OW * diaOW; indexTable.resize((sizeOD + sizeOH + sizeOW) * 2); - float *weightTable = reinterpret_cast(&indexTable[0]); - float *weightOD = static_cast(&weightTable[0]); - float *weightOH = static_cast(&weightTable[sizeOD]); - float *weightOW = static_cast(&weightTable[sizeOD + sizeOH]); + float* weightTable = reinterpret_cast(&indexTable[0]); + float* weightOD = static_cast(&weightTable[0]); + float* weightOH = static_cast(&weightTable[sizeOD]); + float* weightOW = static_cast(&weightTable[sizeOD + sizeOH]); - int *idxTable = static_cast(&indexTable[sizeOD + sizeOH + sizeOW]); - int *idxOD = static_cast(&idxTable[0]); - int *idxOH = static_cast(&idxTable[sizeOD]); - int *idxOW = static_cast(&idxTable[sizeOD + sizeOH]); + int* idxTable = static_cast(&indexTable[sizeOD + sizeOH + sizeOW]); + int* idxOD = static_cast(&idxTable[0]); + int* idxOH = static_cast(&idxTable[sizeOD]); + int* idxOW = static_cast(&idxTable[sizeOD + sizeOH]); for (int oz = 0; oz < static_cast(OD); oz++) { float iz = coordTransToInput(oz, fz, ID, OD); @@ -375,8 +403,11 @@ std::vector ov::intel_cpu::InterpolateExecutor::getCubicCoeffs(float mant // table layout: // OW OW OW OW OW OH OH OH OH OH // x_idx x_weight0 x_weight1 x_weight2 x_weight3 y_idx y_weight0 y_weight1 y_weight2 y_weight3 -void ov::intel_cpu::InterpolateExecutor::buildTblCubic(const VectorDims& srcDimPad5d, const VectorDims& dstDim5d, const std::vector& dataScales, - float cubicCoeff, InterpolateLayoutType layout) { +void ov::intel_cpu::InterpolateExecutor::buildTblCubic(const VectorDims& srcDimPad5d, + const VectorDims& dstDim5d, + const std::vector& dataScales, + float cubicCoeff, + InterpolateLayoutType layout) { int dimSize = dataRank; float fy = dataScales[dimSize - 2]; float fx = dataScales[dimSize - 1]; @@ -394,9 
+425,9 @@ void ov::intel_cpu::InterpolateExecutor::buildTblCubic(const VectorDims& srcDimP } int tblAdvance = 0; - int *xOrigin = static_cast(&indexTable[tblAdvance]); + int* xOrigin = static_cast(&indexTable[tblAdvance]); tblAdvance += OW; - float *xFactor = reinterpret_cast(&indexTable[tblAdvance]); + float* xFactor = reinterpret_cast(&indexTable[tblAdvance]); for (int ox = 0; ox < OW; ox++) { float ix = coordTransToInput(ox, fx, IW, OW); int ix_r = static_cast(std::floor(ix)); @@ -410,9 +441,9 @@ void ov::intel_cpu::InterpolateExecutor::buildTblCubic(const VectorDims& srcDimP } tblAdvance += CUBIC_GRID_LEN * OW; - int *yOrigin = static_cast(&indexTable[tblAdvance]); + int* yOrigin = static_cast(&indexTable[tblAdvance]); tblAdvance += OH; - float *yFactor = reinterpret_cast(&indexTable[tblAdvance]); + float* yFactor = reinterpret_cast(&indexTable[tblAdvance]); for (int oy = 0; oy < OH; oy++) { float iy = coordTransToInput(oy, fy, IH, OH); int iy_r = static_cast(std::floor(iy)); @@ -427,9 +458,9 @@ void ov::intel_cpu::InterpolateExecutor::buildTblCubic(const VectorDims& srcDimP if (layout == InterpolateLayoutType::planar) { tblAdvance += CUBIC_GRID_LEN * OH; - int *sequenceOH = static_cast(&indexTable[tblAdvance]); + int* sequenceOH = static_cast(&indexTable[tblAdvance]); tblAdvance += OH * OW; - int *sequenceOW = static_cast(&indexTable[tblAdvance]); + int* sequenceOW = static_cast(&indexTable[tblAdvance]); for (int h = 0; h < OH; ++h) { int offset = h * OW; for (int w = 0; w < OW; ++w) { @@ -447,16 +478,17 @@ inline VectorDims getBlockND(const VectorDims& shape) { int shapeRank = shape.size(); VectorDims blockND(shapeRank + 1, 1); for (int i = shapeRank - 1; i >= 0; i--) { - blockND[i] = shape[i] * blockND[i+1]; + blockND[i] = shape[i] * blockND[i + 1]; } return blockND; } -const uint8_t* ov::intel_cpu::InterpolateExecutor::padPreprocess(const std::vector& src, const std::vector& dst) { - const uint8_t *src_data_origin = src[0]->getDataAs(); +const uint8_t* 
ov::intel_cpu::InterpolateExecutor::padPreprocess(const std::vector& src, + const std::vector& dst) { + const uint8_t* src_data_origin = src[0]->getDataAs(); - const auto &srcDim = src[0]->getStaticDims(); - const auto &dstDim = dst[0]->getStaticDims(); + const auto& srcDim = src[0]->getStaticDims(); + const auto& dstDim = dst[0]->getStaticDims(); size_t dimSize = srcDim.size(); auto srcDimPad = getSrcDimPad5d(); @@ -465,7 +497,7 @@ const uint8_t* ov::intel_cpu::InterpolateExecutor::padPreprocess(const std::vect const auto dstDim5d = to5Dim(dstDim); const auto srcDataSize = src[0]->getDesc().getPrecision().size(); - const uint8_t *src_data = nullptr; + const uint8_t* src_data = nullptr; std::vector srcPadded; if (interpAttrs.hasPad) { int padB0 = (dimSize > 2) ? interpAttrs.padBegin[0] : 0; @@ -479,23 +511,32 @@ const uint8_t* ov::intel_cpu::InterpolateExecutor::padPreprocess(const std::vect if (interpAttrs.layout == InterpolateLayoutType::planar) { srcPadded.resize(inShapePadBlock[0] * srcDataSize, 0); - uint8_t *src_data_pad = static_cast(&srcPadded[0]); + uint8_t* src_data_pad = static_cast(&srcPadded[0]); parallel_for4d(srcDim5d[0], srcDim5d[1], srcDim5d[2], srcDim5d[3], [&](int n, int c, int d, int h) { - const uint8_t *src = src_data_origin + (inShapeBlock[1] * n + inShapeBlock[2] * c + inShapeBlock[3] * d + inShapeBlock[4] * h) * srcDataSize; - uint8_t *srcPad = src_data_pad + (inShapePadBlock[1] * (n + padB0) + inShapePadBlock[2] * (c + padB1) + - inShapePadBlock[3] * (d + padB2) + inShapePadBlock[4] * (h + padB3) + padB4) * srcDataSize; + const uint8_t* src = src_data_origin + (inShapeBlock[1] * n + inShapeBlock[2] * c + + inShapeBlock[3] * d + inShapeBlock[4] * h) * + srcDataSize; + uint8_t* srcPad = + src_data_pad + (inShapePadBlock[1] * (n + padB0) + inShapePadBlock[2] * (c + padB1) + + inShapePadBlock[3] * (d + padB2) + inShapePadBlock[4] * (h + padB3) + padB4) * + srcDataSize; cpu_memcpy(srcPad, src, srcDim5d[4] * srcDataSize); }); src_data = 
src_data_pad; } else if (interpAttrs.layout == InterpolateLayoutType::by_channel) { srcPadded.resize(inShapePadBlock[0] * srcDataSize, 0); - uint8_t *src_data_pad = static_cast(&srcPadded[0]); + uint8_t* src_data_pad = static_cast(&srcPadded[0]); parallel_for4d(srcDim5d[0], srcDim5d[2], srcDim5d[3], srcDim5d[4], [&](int n, int d, int h, int w) { - const uint8_t *src = src_data_origin + (inShapeBlock[1] * n + - (inShapeBlock[3] * d + inShapeBlock[4] * h + inShapeBlock[5] * w) * srcDim5d[1]) * srcDataSize; - uint8_t *srcPad = src_data_pad + (inShapePadBlock[1] * (n + padB0) + (inShapePadBlock[3] * (d + padB2) + - inShapePadBlock[4] * (h + padB3) + - inShapePadBlock[5] * (w + padB4)) * srcDimPad5d[1] + padB1) * srcDataSize; + const uint8_t* src = src_data_origin + + (inShapeBlock[1] * n + + (inShapeBlock[3] * d + inShapeBlock[4] * h + inShapeBlock[5] * w) * srcDim5d[1]) * + srcDataSize; + uint8_t* srcPad = src_data_pad + (inShapePadBlock[1] * (n + padB0) + + (inShapePadBlock[3] * (d + padB2) + inShapePadBlock[4] * (h + padB3) + + inShapePadBlock[5] * (w + padB4)) * + srcDimPad5d[1] + + padB1) * + srcDataSize; cpu_memcpy(srcPad, src, srcDim5d[1] * srcDataSize); }); src_data = src_data_pad; @@ -504,23 +545,30 @@ const uint8_t* ov::intel_cpu::InterpolateExecutor::padPreprocess(const std::vect size_t CB = div_up(srcDimPad5d[1], blkSize); size_t eltsTotal = srcDimPad5d[0] * CB * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize; srcPadded.resize(eltsTotal * srcDataSize, 0x0); - uint8_t *src_data_pad = static_cast(&srcPadded[0]); + uint8_t* src_data_pad = static_cast(&srcPadded[0]); if ((srcDim5d[0] != srcDimPad5d[0]) || (srcDim5d[1] != srcDimPad5d[1])) { OPENVINO_THROW("Interpolate layer with name does not support padding on batch and channel dimensions"); } - parallel_for5d(srcDim5d[0], CB, srcDim5d[2], srcDim5d[3], srcDim5d[4], [&](int n, int cb, int d, int h, int w) { - const uint8_t *src = src_data_origin + (n * CB * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * 
blkSize) * srcDataSize - + (cb * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize - + (d * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize - + (h * srcDim5d[4] * blkSize) * srcDataSize - + (w * blkSize) * srcDataSize; - uint8_t *srcPad = src_data_pad + (n * CB * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize - + (cb * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize - + ((d + padB2) * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize - + ((h + padB3) * srcDimPad5d[4] * blkSize) * srcDataSize - + ((w + padB4) * blkSize) * srcDataSize; - cpu_memcpy(srcPad, src, blkSize * srcDataSize); - }); + parallel_for5d( + srcDim5d[0], + CB, + srcDim5d[2], + srcDim5d[3], + srcDim5d[4], + [&](int n, int cb, int d, int h, int w) { + const uint8_t* src = src_data_origin + + (n * CB * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize + + (cb * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize + + (d * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize + + (h * srcDim5d[4] * blkSize) * srcDataSize + (w * blkSize) * srcDataSize; + uint8_t* srcPad = + src_data_pad + + (n * CB * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize + + (cb * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize + + ((d + padB2) * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize + + ((h + padB3) * srcDimPad5d[4] * blkSize) * srcDataSize + ((w + padB4) * blkSize) * srcDataSize; + cpu_memcpy(srcPad, src, blkSize * srcDataSize); + }); src_data = src_data_pad; } } else { diff --git a/src/plugins/intel_cpu/src/nodes/executors/interpolate.hpp b/src/plugins/intel_cpu/src/nodes/executors/interpolate.hpp index 15df4eed5f0471..041589c0ab9f6a 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/interpolate.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/interpolate.hpp @@ -11,41 +11,15 @@ namespace ov { namespace intel_cpu { -enum InterpolateLayoutType { - 
planar, - block, - by_channel -}; +enum InterpolateLayoutType { planar, block, by_channel }; -enum InterpolateMode { - nearest, - linear, - linear_onnx, - cubic, - bilinear_pillow, - bicubic_pillow -}; +enum InterpolateMode { nearest, linear, linear_onnx, cubic, bilinear_pillow, bicubic_pillow }; -enum InterpolateCoordTransMode { - half_pixel, - pytorch_half_pixel, - asymmetric, - tf_half_pixel_for_nn, - align_corners -}; +enum InterpolateCoordTransMode { half_pixel, pytorch_half_pixel, asymmetric, tf_half_pixel_for_nn, align_corners }; -enum class InterpolateNearestMode { - round_prefer_floor, - round_prefer_ceil, - floor, - ceil, - simple -}; +enum class InterpolateNearestMode { round_prefer_floor, round_prefer_ceil, floor, ceil, simple }; -enum class InterpolateShapeCalcMode { - sizes, - scales -}; +enum class InterpolateShapeCalcMode { sizes, scales }; struct InterpolateAttrs { InterpolateShapeCalcMode shapeCalcMode = InterpolateShapeCalcMode::sizes; @@ -63,9 +37,9 @@ struct InterpolateAttrs { bool hasPad = false; }; -inline VectorDims getPaddedInputShape(const VectorDims &srcDims, - const std::vector &padBegin, - const std::vector &padEnd) { +inline VectorDims getPaddedInputShape(const VectorDims& srcDims, + const std::vector& padBegin, + const std::vector& padEnd) { VectorDims paddedShape; int dataRank = srcDims.size(); for (int i = 0; i < dataRank; i++) { @@ -80,16 +54,16 @@ inline int clipCoord(int pos, int length) { inline size_t getSpatialDimsNum(const Dim rank) { switch (rank) { - case 1: - case 3: - return 1; - case 2: - case 4: - return 2; - case 5: - return 3; - default: - OPENVINO_THROW("Can't define number spatial"); + case 1: + case 3: + return 1; + case 2: + case 4: + return 2; + case 5: + return 3; + default: + OPENVINO_THROW("Can't define number spatial"); } } @@ -133,27 +107,49 @@ class InterpolateExecutor { virtual bool init(const InterpolateAttrs& interpolateAttrs, const std::vector& srcDescs, const std::vector& dstDescs, - const 
dnnl::primitive_attr &attr); - virtual void exec(const std::vector& src, const std::vector& dst, const void *post_ops_data_) = 0; + const dnnl::primitive_attr& attr); + virtual void exec(const std::vector& src, + const std::vector& dst, + const void* post_ops_data_) = 0; virtual impl_desc_type getImplType() const = 0; virtual ~InterpolateExecutor() = default; - VectorDims getSrcDimPad5d() const { return srcDimPad5d; } + VectorDims getSrcDimPad5d() const { + return srcDimPad5d; + } const uint8_t* padPreprocess(const std::vector& src, const std::vector& dst); private: - void buildTblNN(const VectorDims& srcDimPad5d, const VectorDims& dstDim5d, const std::vector& dataScales, - InterpolateLayoutType layout, InterpolateNearestMode nearestMode); - void buildTblLinearOnnx(const VectorDims& srcDimPad5d, const VectorDims& dstDim5d, const std::vector& dataScales, + void buildTblNN(const VectorDims& srcDimPad5d, + const VectorDims& dstDim5d, + const std::vector& dataScales, + InterpolateLayoutType layout, + InterpolateNearestMode nearestMode); + void buildTblLinearOnnx(const VectorDims& srcDimPad5d, + const VectorDims& dstDim5d, + const std::vector& dataScales, InterpolateLayoutType layout); - void buildTblLinear(const VectorDims& srcDimPad5d, const VectorDims& dstDim5d, const std::vector& dataScales, int kernel_width, + void buildTblLinear(const VectorDims& srcDimPad5d, + const VectorDims& dstDim5d, + const std::vector& dataScales, + int kernel_width, bool antialias); - void buildTblCubic(const VectorDims& srcDimPad5d, const VectorDims& dstDim5d, const std::vector& dataScales, float cubicCoeff, + void buildTblCubic(const VectorDims& srcDimPad5d, + const VectorDims& dstDim5d, + const std::vector& dataScales, + float cubicCoeff, InterpolateLayoutType layout); float coordTransToInput(int outCoord, float scale, int inShape, int outShape) const; int nearestRound(float origin, bool isDownsample, InterpolateNearestMode nearestMode) const; - void linearOnnxCF(int outCoord, float 
scale, int inShape, int outShape, int& index0, int& index1, float& weight0, float& weight1); + void linearOnnxCF(int outCoord, + float scale, + int inShape, + int outShape, + int& index0, + int& index1, + float& weight0, + float& weight1); std::vector getCubicCoeffs(float mantissa, float a); protected: @@ -180,5 +176,5 @@ class InterpolateExecutorBuilder { using InterpolateExecutorBuilderPtr = std::shared_ptr; using InterpolateExecutorBuilderCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/interpolate_list.cpp b/src/plugins/intel_cpu/src/nodes/executors/interpolate_list.cpp index 2362b644583763..21ae249757bf9c 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/interpolate_list.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/interpolate_list.cpp @@ -9,11 +9,10 @@ namespace intel_cpu { const std::vector& getInterpolateExecutorsList() { static std::vector descs = { - OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared()) - }; + OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared())}; return descs; } -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/interpolate_list.hpp b/src/plugins/intel_cpu/src/nodes/executors/interpolate_list.hpp index 2ed16ea04b1852..a0c1fc240731fb 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/interpolate_list.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/interpolate_list.hpp @@ -5,14 +5,13 @@ #pragma once #include "executor.hpp" - #include "interpolate.hpp" #if defined(OV_CPU_WITH_ACL) -#include "acl/acl_interpolate.hpp" +# include "acl/acl_interpolate.hpp" #endif -#include "onednn/iml_type_mapper.h" #include "common/primitive_cache.hpp" +#include "onednn/iml_type_mapper.h" namespace ov { 
namespace intel_cpu { @@ -27,9 +26,10 @@ const std::vector& getInterpolateExecutorsList(); class InterpolateExecutorFactory : public ExecutorFactoryLegacy { public: InterpolateExecutorFactory(const InterpolateAttrs& InterpolateAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs, - const ExecutorContext::CPtr context) : ExecutorFactoryLegacy(context) { + const std::vector& srcDescs, + const std::vector& dstDescs, + const ExecutorContext::CPtr context) + : ExecutorFactoryLegacy(context) { for (auto& desc : getInterpolateExecutorsList()) { if (desc.builder->isSupported(InterpolateAttrs, srcDescs, dstDescs)) { supportedDescs.push_back(desc); @@ -39,9 +39,9 @@ class InterpolateExecutorFactory : public ExecutorFactoryLegacy { ~InterpolateExecutorFactory() = default; virtual InterpolateExecutorPtr makeExecutor(const InterpolateAttrs& interpolateAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs, - const dnnl::primitive_attr &attr) { + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr) { auto build = [&](const InterpolateExecutorDesc* desc) { auto executor = desc->builder->makeExecutor(context); if (executor->init(interpolateAttrs, srcDescs, dstDescs, attr)) { @@ -52,7 +52,6 @@ class InterpolateExecutorFactory : public ExecutorFactoryLegacy { return ptr; }; - if (chosenDesc) { if (auto executor = build(chosenDesc)) { return executor; @@ -81,5 +80,5 @@ class InterpolateExecutorFactory : public ExecutorFactoryLegacy { using InterpolateExecutorFactoryPtr = std::shared_ptr; using InterpolateExecutorFactoryCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/matmul_config.hpp b/src/plugins/intel_cpu/src/nodes/executors/matmul_config.hpp index 9e484b24a2940e..e42bf3138bce91 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/matmul_config.hpp +++ 
b/src/plugins/intel_cpu/src/nodes/executors/matmul_config.hpp @@ -12,7 +12,6 @@ namespace intel_cpu { struct MatMulAttrs { bool transposeA; bool transposeB; - std::vector dequantizationScales; }; using MatMulConfig = executor::Config; diff --git a/src/plugins/intel_cpu/src/nodes/executors/memory_arguments.hpp b/src/plugins/intel_cpu/src/nodes/executors/memory_arguments.hpp index c04ca39e845ee1..05c3cf0d5df259 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/memory_arguments.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/memory_arguments.hpp @@ -12,9 +12,9 @@ namespace ov { namespace intel_cpu { using MemoryDescArgs = std::unordered_map; -using MemoryArgs = std::unordered_map; +using MemoryArgs = std::unordered_map; -// @todo add more options +// basic inputs #define ARG_SRC_0 1 #define ARG_SRC ARG_SRC_0 #define ARG_SRC_1 2 @@ -24,6 +24,12 @@ using MemoryArgs = std::unordered_map; #define ARG_WEI_0 33 #define ARG_WEI ARG_WEI_0 #define ARG_BIAS 41 +// legacy dequantization scale +#define ARG_DST_DEQ_SCALE 53 +// scaling factors provided at execution time +#define ARG_ATTR_SCALES 4096 +// zero points provided at execution time +#define ARG_ATTR_ZERO_POINTS 8192 } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_gemm.cpp b/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_gemm.cpp index a03bfe2649413a..7e50c8086789a0 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_gemm.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_gemm.cpp @@ -23,6 +23,10 @@ using namespace executor; using namespace dnnl; using namespace ov::element; +static Dim batchDim(const VectorDims& dims) { + return std::accumulate(dims.begin(), dims.end() - 1, 1, std::multiplies()); +} + static MemoryPtr prepareWeightMemory(const MemoryPtr weightsMemory, const ExecutorContext::CPtr context, const bool weightsTransposed) { @@ -31,14 +35,15 @@ static MemoryPtr prepareWeightMemory(const MemoryPtr weightsMemory, // 
Weights are transposed by MatMulConstTransposesExtraction // K is the IC of weight // the weight is reshaped to [-1, K] in ConvertMatMulToFC - const auto K = wgtDims[1]; - const auto N = wgtDims[0]; + Dim K = wgtDims.back(); + Dim N = batchDim(wgtDims); auto packedBsize = mlas_sgemm_pack_get_size(N, K); auto create = [&]() { float* weightPtr = weightsMemory->getDataAs(); size_t ldb = weightsTransposed ? K : N; + MemoryPtr _ptr = std::make_shared(context->getEngine(), intel_cpu::CpuBlockedMemoryDesc(i8, intel_cpu::Shape{packedBsize})); float* prepackedDst = _ptr->getDataAs(); @@ -66,21 +71,10 @@ bool MlasGemmExecutor::supports(const FCConfig& config) { DEBUG_LOG("MlasGemmExecutor: PostOps are not supported"); return false; } - const auto& weiDesc = config.descs.at(ARG_WEI); - const auto& dstDesc = config.descs.at(ARG_DST); - // MLAS cannot support weight dims > 2, e.g. [1,64,9,9] * [10,64,9,9] - const auto& weightsDims = weiDesc->getShape().getStaticDims(); - if (weightsDims.size() > 2) { - if (!std::all_of(weightsDims.begin() + 2, weightsDims.end(), [](const Dim dim) { - return dim == 1; - })) { - DEBUG_LOG("MlasGemmExecutor: weights dims > 2 are not supported"); - return false; - } - } + const auto& dstDesc = config.descs.at(ARG_DST); - if (config.attrs.withBias) { + if (!config.descs.at(ARG_BIAS)->empty()) { const auto& biaDesc = config.descs.at(ARG_BIAS); const auto& biasDims = biaDesc->getShape().getStaticDims(); const auto& outDims = dstDesc->getShape().getDims(); @@ -108,24 +102,16 @@ MlasGemmExecutor::MlasGemmExecutor(const FCAttrs& attrs, const ExecutorContext::CPtr context) : m_attrs(attrs), m_memoryArgs(memory), - packedWeights(prepareWeightMemory(memory.at(ARG_WEI), context, !attrs.weightsNonTransposed)) {} + packedWeights(prepareWeightMemory(memory.at(ARG_WEI), context, !attrs.weightsNonTransposed)), + N(batchDim(memory.at(ARG_WEI)->getStaticDims())), + K(memory.at(ARG_WEI)->getStaticDims().back()) {} bool MlasGemmExecutor::update(const MemoryArgs& 
memory) { - const auto& weiDesc = memory.at(ARG_WEI)->getDescPtr(); const auto& dstDesc = memory.at(ARG_DST)->getDescPtr(); - const auto& wgtDims = weiDesc->getShape().getStaticDims(); - // Weights are transposed by MatMulConstTransposesExtraction - // K is the IC of weight - // the weight is reshaped to [-1, K] in ConvertMatMulToFC - K = wgtDims[1]; - N = wgtDims[0]; const auto& outDims = dstDesc->getShape().getStaticDims(); - if (outDims.size() > 2) { - M = std::accumulate(outDims.begin(), outDims.end() - 1, 1, std::multiplies()); - } else { - M = outDims[0]; - } + M = outDims.size() > 2 ? batchDim(outDims) : outDims[0]; + return true; } diff --git a/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_transpose.cpp b/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_transpose.cpp index 678fe5a5c22176..2b8b71bfbced0b 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_transpose.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_transpose.cpp @@ -3,9 +3,10 @@ // #include "mlas_transpose.hpp" -#include "openvino/core/parallel.hpp" -#include "nodes/common/cpu_memcpy.h" + #include "mlas.h" +#include "nodes/common/cpu_memcpy.h" +#include "openvino/core/parallel.hpp" namespace ov { namespace intel_cpu { @@ -24,7 +25,12 @@ struct has_mlas_transpose : std::true_type {}; template typename std::enable_if::value, void>::type SimpleTransposeSingleAxisOutwards( - const T* input_data, T* output_data, int64_t num_loops, int64_t num_writers, int64_t writes_per_loop, int64_t writes_per_writer_per_loop) { + const T* input_data, + T* output_data, + int64_t num_loops, + int64_t num_writers, + int64_t writes_per_loop, + int64_t writes_per_writer_per_loop) { const T* end; for (int64_t l = 0; l < num_loops; ++l) { T* output_for_first_writer = output_data; @@ -44,9 +50,17 @@ typename std::enable_if::value, void>::type SimpleTranspo template typename std::enable_if::value, void>::type SimpleTransposeSingleAxisOutwards( - const T* input_data, T* output_data, 
int64_t num_loops, int64_t num_writers, int64_t writes_per_loop, int64_t writes_per_writer_per_loop) { + const T* input_data, + T* output_data, + int64_t num_loops, + int64_t num_writers, + int64_t writes_per_loop, + int64_t writes_per_writer_per_loop) { for (int64_t l = 0; l < num_loops; ++l) { - MlasTranspose(input_data, output_data, static_cast(writes_per_writer_per_loop), static_cast(num_writers)); + MlasTranspose(input_data, + output_data, + static_cast(writes_per_writer_per_loop), + static_cast(num_writers)); input_data += writes_per_loop; output_data += writes_per_loop; } @@ -54,7 +68,12 @@ typename std::enable_if::value, void>::type SimpleTranspos template typename std::enable_if::value, void>::type SimpleTransposeSingleAxisInwards( - const T* input_data, T* output_data, int64_t num_loops, int64_t num_readers, int64_t reads_per_loop, int64_t reads_per_reader_per_loop) { + const T* input_data, + T* output_data, + int64_t num_loops, + int64_t num_readers, + int64_t reads_per_loop, + int64_t reads_per_reader_per_loop) { T* end; for (int64_t l = 0; l < num_loops; ++l) { const T* input_for_first_reader = input_data; @@ -74,9 +93,17 @@ typename std::enable_if::value, void>::type SimpleTranspo template typename std::enable_if::value, void>::type SimpleTransposeSingleAxisInwards( - const T* input_data, T* output_data, int64_t num_loops, int64_t num_readers, int64_t reads_per_loop, int64_t reads_per_reader_per_loop) { + const T* input_data, + T* output_data, + int64_t num_loops, + int64_t num_readers, + int64_t reads_per_loop, + int64_t reads_per_reader_per_loop) { for (int64_t l = 0; l < num_loops; ++l) { - MlasTranspose(input_data, output_data, static_cast(num_readers), static_cast(reads_per_reader_per_loop)); + MlasTranspose(input_data, + output_data, + static_cast(num_readers), + static_cast(reads_per_reader_per_loop)); input_data += reads_per_loop; output_data += reads_per_loop; } @@ -148,7 +175,10 @@ bool 
MlasTransposeExecutor::IsTransposeMovingSingleAxis(VectorDims permutations, return single_axis_moved; } -void MlasTransposeExecutor::TransposeSingleAxisOutwards(const MemoryCPtr& input, const MemoryPtr& output, size_t from, size_t to) { +void MlasTransposeExecutor::TransposeSingleAxisOutwards(const MemoryCPtr& input, + const MemoryPtr& output, + size_t from, + size_t to) { const auto& input_shape = input->getShape(); const auto& input_dims = input_shape.getDims(); const auto element_size = input->getDesc().getPrecision().size(); @@ -165,52 +195,68 @@ void MlasTransposeExecutor::TransposeSingleAxisOutwards(const MemoryCPtr& input, const size_t bytes_per_write = static_cast(block_size) * element_size; switch (bytes_per_write) { - case (sizeof(uint8_t)): { - SimpleTransposeSingleAxisOutwards(input_data, output_data, num_loops, num_writers, writes_per_loop, - writes_per_writer_per_loop); - break; - } - case (sizeof(uint16_t)): { - SimpleTransposeSingleAxisOutwards(reinterpret_cast(input_data), - reinterpret_cast(output_data), num_loops, num_writers, - writes_per_loop, writes_per_writer_per_loop); - break; - } - case (sizeof(uint32_t)): { - SimpleTransposeSingleAxisOutwards(reinterpret_cast(input_data), - reinterpret_cast(output_data), num_loops, num_writers, - writes_per_loop, writes_per_writer_per_loop); - break; - } - case (sizeof(uint64_t)): { - SimpleTransposeSingleAxisOutwards(reinterpret_cast(input_data), - reinterpret_cast(output_data), num_loops, num_writers, - writes_per_loop, writes_per_writer_per_loop); - break; - } - default: { - // we need to use memcpy for each block - for (int64_t l = 0; l < num_loops; ++l) { - uint8_t* output_for_first_writer = output_data; + case (sizeof(uint8_t)): { + SimpleTransposeSingleAxisOutwards(input_data, + output_data, + num_loops, + num_writers, + writes_per_loop, + writes_per_writer_per_loop); + break; + } + case (sizeof(uint16_t)): { + SimpleTransposeSingleAxisOutwards(reinterpret_cast(input_data), + 
reinterpret_cast(output_data), + num_loops, + num_writers, + writes_per_loop, + writes_per_writer_per_loop); + break; + } + case (sizeof(uint32_t)): { + SimpleTransposeSingleAxisOutwards(reinterpret_cast(input_data), + reinterpret_cast(output_data), + num_loops, + num_writers, + writes_per_loop, + writes_per_writer_per_loop); + break; + } + case (sizeof(uint64_t)): { + SimpleTransposeSingleAxisOutwards(reinterpret_cast(input_data), + reinterpret_cast(output_data), + num_loops, + num_writers, + writes_per_loop, + writes_per_writer_per_loop); + break; + } + default: { + // we need to use memcpy for each block + for (int64_t l = 0; l < num_loops; ++l) { + uint8_t* output_for_first_writer = output_data; - for (auto wwpl = 0; wwpl < writes_per_writer_per_loop; ++wwpl) { - uint8_t* output_for_current_writer = output_for_first_writer; + for (auto wwpl = 0; wwpl < writes_per_writer_per_loop; ++wwpl) { + uint8_t* output_for_current_writer = output_for_first_writer; - for (uint64_t w = 0; w < num_writers; ++w) { - memcpy(output_for_current_writer, input_data, bytes_per_write); - // skip to output position for next writer - output_for_current_writer += (writes_per_writer_per_loop * bytes_per_write); - input_data += bytes_per_write; - } - output_for_first_writer += bytes_per_write; + for (uint64_t w = 0; w < num_writers; ++w) { + memcpy(output_for_current_writer, input_data, bytes_per_write); + // skip to output position for next writer + output_for_current_writer += (writes_per_writer_per_loop * bytes_per_write); + input_data += bytes_per_write; } - output_data += writes_per_loop * bytes_per_write; + output_for_first_writer += bytes_per_write; } + output_data += writes_per_loop * bytes_per_write; } } + } } -void MlasTransposeExecutor::TransposeSingleAxisInwards(const MemoryCPtr& input, const MemoryPtr& output, size_t from, size_t to) { +void MlasTransposeExecutor::TransposeSingleAxisInwards(const MemoryCPtr& input, + const MemoryPtr& output, + size_t from, + size_t to) { 
const auto& input_shape = input->getShape(); const auto& input_dims = input_shape.getDims(); @@ -227,61 +273,74 @@ void MlasTransposeExecutor::TransposeSingleAxisInwards(const MemoryCPtr& input, const size_t bytes_per_read = static_cast(block_size) * element_size; switch (bytes_per_read) { - case (sizeof(uint8_t)): { - SimpleTransposeSingleAxisInwards(input_data, output_data, num_loops, num_readers, reads_per_loop, - reads_per_reader_per_loop); - break; - } - case (sizeof(uint16_t)): { - SimpleTransposeSingleAxisInwards(reinterpret_cast(input_data), - reinterpret_cast(output_data), num_loops, num_readers, reads_per_loop, - reads_per_reader_per_loop); - break; - } - case (sizeof(uint32_t)): { - SimpleTransposeSingleAxisInwards(reinterpret_cast(input_data), - reinterpret_cast(output_data), num_loops, num_readers, reads_per_loop, - reads_per_reader_per_loop); - break; - } - case (sizeof(uint64_t)): { - SimpleTransposeSingleAxisInwards(reinterpret_cast(input_data), - reinterpret_cast(output_data), num_loops, num_readers, reads_per_loop, - reads_per_reader_per_loop); - break; - } - default: { - // we need to use memcpy for each block - for (int64_t l = 0; l < num_loops; ++l) { - const uint8_t* input_for_first_reader = input_data; - for (auto rrpl = 0; rrpl < reads_per_reader_per_loop; ++rrpl) { - const uint8_t* input_for_current_reader = input_for_first_reader; - for (uint64_t r = 0; r < num_readers; ++r) { - memcpy(output_data, input_for_current_reader, bytes_per_read); - output_data += bytes_per_read; - // skip to input position for next reader - input_for_current_reader += (reads_per_reader_per_loop * bytes_per_read); - } - input_for_first_reader += bytes_per_read; + case (sizeof(uint8_t)): { + SimpleTransposeSingleAxisInwards(input_data, + output_data, + num_loops, + num_readers, + reads_per_loop, + reads_per_reader_per_loop); + break; + } + case (sizeof(uint16_t)): { + SimpleTransposeSingleAxisInwards(reinterpret_cast(input_data), + reinterpret_cast(output_data), + 
num_loops, + num_readers, + reads_per_loop, + reads_per_reader_per_loop); + break; + } + case (sizeof(uint32_t)): { + SimpleTransposeSingleAxisInwards(reinterpret_cast(input_data), + reinterpret_cast(output_data), + num_loops, + num_readers, + reads_per_loop, + reads_per_reader_per_loop); + break; + } + case (sizeof(uint64_t)): { + SimpleTransposeSingleAxisInwards(reinterpret_cast(input_data), + reinterpret_cast(output_data), + num_loops, + num_readers, + reads_per_loop, + reads_per_reader_per_loop); + break; + } + default: { + // we need to use memcpy for each block + for (int64_t l = 0; l < num_loops; ++l) { + const uint8_t* input_for_first_reader = input_data; + for (auto rrpl = 0; rrpl < reads_per_reader_per_loop; ++rrpl) { + const uint8_t* input_for_current_reader = input_for_first_reader; + for (uint64_t r = 0; r < num_readers; ++r) { + memcpy(output_data, input_for_current_reader, bytes_per_read); + output_data += bytes_per_read; + // skip to input position for next reader + input_for_current_reader += (reads_per_reader_per_loop * bytes_per_read); } - input_data += reads_per_loop * bytes_per_read; + input_for_first_reader += bytes_per_read; } + input_data += reads_per_loop * bytes_per_read; } } + } } void MlasTransposeExecutor::exec(const std::vector& src, const std::vector& dst) { if (from > to) { - TransposeSingleAxisOutwards(src[0], dst[0], from, to); + TransposeSingleAxisOutwards(src[0], dst[0], from, to); } else { - TransposeSingleAxisInwards(src[0], dst[0], from, to); + TransposeSingleAxisInwards(src[0], dst[0], from, to); } } -bool MlasTransposeExecutor::init(const TransposeParams &transposeParams, - const std::vector &srcDescs, - const std::vector &dstDescs, - const dnnl::primitive_attr &attr) { +bool MlasTransposeExecutor::init(const TransposeParams& transposeParams, + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr) { if (!IsTransposeMovingSingleAxis(transposeParams.permuteParams.order, from, to)) { 
DEBUG_LOG("MLAS Transpose executor supports moving single axis only"); return false; @@ -292,8 +351,7 @@ bool MlasTransposeExecutor::init(const TransposeParams &transposeParams, bool MlasTransposeExecutorBuilder::isSupported(const TransposeParams& transposeParams, const std::vector& srcDescs, const std::vector& dstDescs) const { - if (!srcDescs[0]->hasLayoutType(LayoutType::ncsp) || - !dstDescs[0]->hasLayoutType(LayoutType::ncsp)) { + if (!srcDescs[0]->hasLayoutType(LayoutType::ncsp) || !dstDescs[0]->hasLayoutType(LayoutType::ncsp)) { DEBUG_LOG("MLAS Transpose executor supports NCHW layout only"); return false; } @@ -308,5 +366,5 @@ TransposeExecutorPtr MlasTransposeExecutorBuilder::makeExecutor(const ExecutorCo return std::make_shared(context); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_transpose.hpp b/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_transpose.hpp index d7e0307414aac9..8f7cd1bf8c22bd 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_transpose.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_transpose.hpp @@ -11,13 +11,16 @@ namespace intel_cpu { class MlasTransposeExecutor : public TransposeExecutor { public: using TransposeExecutor::TransposeExecutor; - bool init(const TransposeParams &transposeParams, - const std::vector &srcDescs, - const std::vector &dstDescs, - const dnnl::primitive_attr &attr) override; - void exec(const std::vector &src, const std::vector &dst) override; + bool init(const TransposeParams& transposeParams, + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr) override; + void exec(const std::vector& src, const std::vector& dst) override; + + impl_desc_type implType() const override { + return impl_desc_type::mlas; + } - impl_desc_type implType() const override { return impl_desc_type::mlas; } private: static int64_t calcShapeSize(const 
Shape& shape, size_t start, size_t end); static bool IsTransposeMovingSingleAxis(VectorDims permutations, size_t& from, size_t& to); @@ -37,5 +40,5 @@ class MlasTransposeExecutorBuilder : public TransposeExecutorBuilder { TransposeExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const override; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/mvn.cpp b/src/plugins/intel_cpu/src/nodes/executors/mvn.cpp index 9b522ed9887344..eec9d2a8947975 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/mvn.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/mvn.cpp @@ -11,26 +11,34 @@ MVNExecutor::MVNExecutor(const ExecutorContext::CPtr context) : context(context) VectorDims MVNExecutor::transformTo5DCase(const VectorDims& shape, bool initAcrossChannels) { switch (shape.size()) { - // for 1 and 2 rank, if initAcrossChannels_ is true, adjust shape to fully vectorize under unified 5d procedure. - // otherwise there are not enough data in spatial dimension to process in one kernel. - case 1 : // C - if (initAcrossChannels) { - return VectorDims({1, 1, 1, 1, shape[0]}); - } else { - return VectorDims({1, shape[0], 1, 1, 1}); - } - case 2 : // NC - if (initAcrossChannels) { - return VectorDims({1, shape[0], 1, shape[1], 1}); - } else { - return VectorDims({shape[0], shape[1], 1, 1, 1}); - } - case 3 : { return VectorDims({shape[0], shape[1], 1, shape[2], 1}); } - case 4 : { return VectorDims({shape[0], shape[1], 1, shape[2], shape[3]}); } - case 5 : { return VectorDims({shape[0], shape[1], shape[2], shape[3], shape[4]}); } - default : { OPENVINO_THROW("MVN executor doesn't support planar layout with rank: ", shape.size()); } + // for 1 and 2 rank, if initAcrossChannels_ is true, adjust shape to fully vectorize under unified 5d procedure. + // otherwise there are not enough data in spatial dimension to process in one kernel. 
+ case 1: // C + if (initAcrossChannels) { + return VectorDims({1, 1, 1, 1, shape[0]}); + } else { + return VectorDims({1, shape[0], 1, 1, 1}); + } + case 2: // NC + if (initAcrossChannels) { + return VectorDims({1, shape[0], 1, shape[1], 1}); + } else { + return VectorDims({shape[0], shape[1], 1, 1, 1}); + } + case 3: { + return VectorDims({shape[0], shape[1], 1, shape[2], 1}); + } + case 4: { + return VectorDims({shape[0], shape[1], 1, shape[2], shape[3]}); + } + case 5: { + return VectorDims({shape[0], shape[1], shape[2], shape[3], shape[4]}); + } + default: { + OPENVINO_THROW("MVN executor doesn't support planar layout with rank: ", shape.size()); + } } } -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/mvn.hpp b/src/plugins/intel_cpu/src/nodes/executors/mvn.hpp index 759115a4b4b794..da51b5d1ef67e9 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/mvn.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/mvn.hpp @@ -5,29 +5,22 @@ #pragma once #include "cpu_memory.h" -#include "onednn/iml_type_mapper.h" #include "executor.hpp" +#include "onednn/iml_type_mapper.h" namespace ov { namespace intel_cpu { -enum MVNLayoutType { - mvn_planar, - mvn_block, - mvn_by_channel -}; +enum MVNLayoutType { mvn_planar, mvn_block, mvn_by_channel }; // Defines way to add epsilon: inside sqrt or outside. 
-enum MVNEpsMode { - INSIDE_SQRT, - OUTSIDE_SQRT -}; +enum MVNEpsMode { INSIDE_SQRT, OUTSIDE_SQRT }; struct MVNAttrs { MVNLayoutType layout = mvn_planar; bool initAcrossChannels_ = false; bool execAcrossChannels_ = false; - bool normalizeVariance_ = false; + bool normalizeVariance_ = false; float epsValue_ = 0.0f; MVNEpsMode epsMode_ = INSIDE_SQRT; ov::element::Type src_prc; @@ -40,9 +33,11 @@ class MVNExecutor { virtual bool init(const MVNAttrs& mvnAttrs, const std::vector& srcDescs, const std::vector& dstDescs, - const dnnl::primitive_attr &attr) = 0; + const dnnl::primitive_attr& attr) = 0; - virtual void exec(const std::vector& src, const std::vector& dst, const void *post_ops_data_) = 0; + virtual void exec(const std::vector& src, + const std::vector& dst, + const void* post_ops_data_) = 0; virtual ~MVNExecutor() = default; virtual impl_desc_type getImplType() const = 0; @@ -60,12 +55,14 @@ using MVNExecutorCPtr = std::shared_ptr; class MVNExecutorBuilder { public: ~MVNExecutorBuilder() = default; - virtual bool isSupported(const MVNAttrs& mvnAttrs, const std::vector& srcDescs, const std::vector& dstDescs) const = 0; + virtual bool isSupported(const MVNAttrs& mvnAttrs, + const std::vector& srcDescs, + const std::vector& dstDescs) const = 0; virtual MVNExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const = 0; }; using MVNExecutorBuilderPtr = std::shared_ptr; using MVNExecutorBuilderCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/mvn_list.cpp b/src/plugins/intel_cpu/src/nodes/executors/mvn_list.cpp index c27751b7a2d2b4..99a55d79f58177 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/mvn_list.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/mvn_list.cpp @@ -9,11 +9,10 @@ namespace intel_cpu { const std::vector& getMVNExecutorsList() { static std::vector descs = { - OV_CPU_INSTANCE_ACL(ExecutorType::Acl, 
std::make_shared()) - }; + OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared())}; return descs; } -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/mvn_list.hpp b/src/plugins/intel_cpu/src/nodes/executors/mvn_list.hpp index 3a8d3cc61fe585..82f8e868ac2d81 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/mvn_list.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/mvn_list.hpp @@ -5,14 +5,13 @@ #pragma once #include "executor.hpp" - #include "mvn.hpp" #if defined(OV_CPU_WITH_ACL) -#include "acl/acl_mvn.hpp" +# include "acl/acl_mvn.hpp" #endif -#include "onednn/iml_type_mapper.h" #include "common/primitive_cache.hpp" +#include "onednn/iml_type_mapper.h" namespace ov { namespace intel_cpu { @@ -29,7 +28,8 @@ class MVNExecutorFactory : public ExecutorFactoryLegacy { MVNExecutorFactory(const MVNAttrs& mvnAttrs, const std::vector& srcDescs, const std::vector& dstDescs, - const ExecutorContext::CPtr context) : ExecutorFactoryLegacy(context) { + const ExecutorContext::CPtr context) + : ExecutorFactoryLegacy(context) { for (auto& desc : getMVNExecutorsList()) { if (desc.builder->isSupported(mvnAttrs, srcDescs, dstDescs)) { supportedDescs.push_back(desc); @@ -41,7 +41,7 @@ class MVNExecutorFactory : public ExecutorFactoryLegacy { virtual MVNExecutorPtr makeExecutor(const MVNAttrs& mvnAttrs, const std::vector& srcDescs, const std::vector& dstDescs, - const dnnl::primitive_attr &attr) { + const dnnl::primitive_attr& attr) { auto build = [&](const MVNExecutorDesc* desc) { auto executor = desc->builder->makeExecutor(context); if (executor->init(mvnAttrs, srcDescs, dstDescs, attr)) { @@ -80,5 +80,5 @@ class MVNExecutorFactory : public ExecutorFactoryLegacy { using MVNExecutorFactoryPtr = std::shared_ptr; using MVNExecutorFactoryCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu 
+} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/pooling.cpp b/src/plugins/intel_cpu/src/nodes/executors/pooling.cpp index e15d1a4ef15b8d..95448640e3b125 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/pooling.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/pooling.cpp @@ -9,5 +9,5 @@ namespace intel_cpu { PoolingExecutor::PoolingExecutor(const ExecutorContext::CPtr context) : context(context) {} -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/pooling.hpp b/src/plugins/intel_cpu/src/nodes/executors/pooling.hpp index 5ea358c68afc8e..e826d3a37250db 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/pooling.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/pooling.hpp @@ -5,8 +5,8 @@ #pragma once #include "cpu_memory.h" -#include "onednn/iml_type_mapper.h" #include "executor.hpp" +#include "onednn/iml_type_mapper.h" namespace ov { namespace intel_cpu { @@ -44,9 +44,11 @@ class PoolingExecutor { virtual bool init(const PoolingAttrs& poolingAttrs, const std::vector& srcDescs, const std::vector& dstDescs, - const dnnl::primitive_attr &attr) = 0; + const dnnl::primitive_attr& attr) = 0; - virtual void exec(const std::vector& src, const std::vector& dst, std::unordered_map postOpsArgs) = 0; + virtual void exec(const std::vector& src, + const std::vector& dst, + std::unordered_map postOpsArgs) = 0; virtual ~PoolingExecutor() = default; virtual impl_desc_type getImplType() const = 0; @@ -71,5 +73,5 @@ class PoolingExecutorBuilder { using PoolingExecutorBuilderPtr = std::shared_ptr; using PoolingExecutorBuilderCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/pooling_list.cpp 
b/src/plugins/intel_cpu/src/nodes/executors/pooling_list.cpp index 4b130f37bfff57..d0ee9f7da574c6 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/pooling_list.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/pooling_list.cpp @@ -9,11 +9,10 @@ namespace intel_cpu { const std::vector& getPoolingExecutorsList() { static std::vector descs = { - OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared()) - }; + OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared())}; return descs; } -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/pooling_list.hpp b/src/plugins/intel_cpu/src/nodes/executors/pooling_list.hpp index d6ce5489105b19..1c051ae7d2959d 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/pooling_list.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/pooling_list.hpp @@ -5,10 +5,9 @@ #pragma once #include "executor.hpp" - #include "pooling.hpp" #if defined(OV_CPU_WITH_ACL) -#include "acl/acl_pooling.hpp" +# include "acl/acl_pooling.hpp" #endif namespace ov { @@ -24,9 +23,10 @@ const std::vector& getPoolingExecutorsList(); class PoolingExecutorFactory : public ExecutorFactoryLegacy { public: PoolingExecutorFactory(const PoolingAttrs& poolingAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs, - const ExecutorContext::CPtr context) : ExecutorFactoryLegacy(context) { + const std::vector& srcDescs, + const std::vector& dstDescs, + const ExecutorContext::CPtr context) + : ExecutorFactoryLegacy(context) { for (auto& desc : getPoolingExecutorsList()) { if (desc.builder->isSupported(poolingAttrs, srcDescs, dstDescs)) { supportedDescs.push_back(desc); @@ -36,9 +36,9 @@ class PoolingExecutorFactory : public ExecutorFactoryLegacy { ~PoolingExecutorFactory() = default; virtual PoolingExecutorPtr makeExecutor(const PoolingAttrs& poolingAttrs, - const std::vector& srcDescs, - const std::vector& 
dstDescs, - const dnnl::primitive_attr &attr) { + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr) { auto build = [&](const PoolingExecutorDesc* desc) { auto executor = desc->builder->makeExecutor(context); if (executor->init(poolingAttrs, srcDescs, dstDescs, attr)) { @@ -49,7 +49,6 @@ class PoolingExecutorFactory : public ExecutorFactoryLegacy { return ptr; }; - if (chosenDesc) { if (auto executor = build(chosenDesc)) { return executor; @@ -74,5 +73,5 @@ class PoolingExecutorFactory : public ExecutorFactoryLegacy { using PoolingExecutorFactoryPtr = std::shared_ptr; using PoolingExecutorFactoryCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/precision_matcher.cpp b/src/plugins/intel_cpu/src/nodes/executors/precision_matcher.cpp index 95044a9e205595..ced50dd2ec3dd5 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/precision_matcher.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/precision_matcher.cpp @@ -15,9 +15,12 @@ namespace intel_cpu { bool match(const InOutTypeMask& patterns, const InOutTypes& values) { assert(patterns.size() == values.size()); - return std::equal(values.begin(), values.end(), patterns.begin(), [](const ov::element::Type value, const TypeMask pattern) { - return pattern & value; - }); + return std::equal(values.begin(), + values.end(), + patterns.begin(), + [](const ov::element::Type value, const TypeMask pattern) { + return pattern & value; + }); return true; } diff --git a/src/plugins/intel_cpu/src/nodes/executors/precision_translation.cpp b/src/plugins/intel_cpu/src/nodes/executors/precision_translation.cpp index 73aac151843b08..36aab4f8fddc77 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/precision_translation.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/precision_translation.cpp @@ -14,7 +14,9 @@ namespace ov { namespace intel_cpu { -InOutTypes 
getTypeConfiguration(const MemoryDescArgs& descriptors, const TypeMapping& mapping, const MappingNotation& notation) { +InOutTypes getTypeConfiguration(const MemoryDescArgs& descriptors, + const TypeMapping& mapping, + const MappingNotation& notation) { InOutTypes types; std::transform(notation.begin(), notation.end(), std::back_inserter(types), [&descriptors](int id) { return descriptors.at(id)->getPrecision(); diff --git a/src/plugins/intel_cpu/src/nodes/executors/precision_translation.hpp b/src/plugins/intel_cpu/src/nodes/executors/precision_translation.hpp index 374b584dd0ffb5..20e613eea2c236 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/precision_translation.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/precision_translation.hpp @@ -18,24 +18,21 @@ namespace intel_cpu { template struct use { - ov::element::Type operator()(const std::vector& types, - size_t idx) const { + ov::element::Type operator()(const std::vector& types, size_t idx) const { assert(bypassId < types.size()); return types[bypassId]; } }; struct bypass { - ov::element::Type operator()(const std::vector& types, - size_t idx) const { + ov::element::Type operator()(const std::vector& types, size_t idx) const { return types[idx]; } }; template struct just { - ov::element::Type operator()(const std::vector& types, - size_t idx) const { + ov::element::Type operator()(const std::vector& types, size_t idx) const { // ignore everything (void)types; (void)idx; @@ -45,8 +42,7 @@ struct just { template <> struct just { - ov::element::Type operator()(const std::vector& types, - size_t idx) const { + ov::element::Type operator()(const std::vector& types, size_t idx) const { // ignore everything (void)types; (void)idx; @@ -58,11 +54,9 @@ using policy = std::function - PortsTranslation(Policies... policies) : - m_policies{policies...} {} + PortsTranslation(Policies... 
policies) : m_policies{policies...} {} - std::vector operator()( - const std::vector& types) const { + std::vector operator()(const std::vector& types) const { assert(types.size() == m_policies.size()); std::vector result; @@ -73,6 +67,7 @@ struct PortsTranslation { return result; } + private: std::vector m_policies; }; @@ -88,9 +83,7 @@ class TypeMappingEntry { public: using EnabledPredicate = std::function; - TypeMappingEntry(InOutTypeMask mask, - TypeTranslationFunction translation, - EnabledPredicate enabled = {}) + TypeMappingEntry(InOutTypeMask mask, TypeTranslationFunction translation, EnabledPredicate enabled = {}) : m_mask(std::move(mask)), m_translation(std::move(translation)), m_enabled(std::move(enabled)) {} @@ -121,7 +114,9 @@ using TypeMapping = std::vector; using MappingNotation = std::vector; using pt = PortsTranslation; -InOutTypes getTypeConfiguration(const MemoryDescArgs& descriptors, const TypeMapping& mapping, const MappingNotation& notation); +InOutTypes getTypeConfiguration(const MemoryDescArgs& descriptors, + const TypeMapping& mapping, + const MappingNotation& notation); } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/printers.cpp b/src/plugins/intel_cpu/src/nodes/executors/printers.cpp index ac52b25a069541..1bce932225827d 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/printers.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/printers.cpp @@ -4,25 +4,27 @@ #ifdef CPU_DEBUG_CAPS -#include -#include "printers.hpp" -#include "post_ops.hpp" -#include "fullyconnected_config.hpp" +# include "printers.hpp" + +# include + +# include "fullyconnected_config.hpp" +# include "post_ops.hpp" namespace ov { namespace intel_cpu { -std::ostream & operator<<(std::ostream & os, const FCAttrs& attrs) { +std::ostream& operator<<(std::ostream& os, const FCAttrs& attrs) { // @todo print Attrs return os; } -std::ostream & operator<<(std::ostream & os, const PostOps& postOps) { +std::ostream& 
operator<<(std::ostream& os, const PostOps& postOps) { // @todo print PostOps return os; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov -#endif // CPU_DEBUG_CAPS +#endif // CPU_DEBUG_CAPS diff --git a/src/plugins/intel_cpu/src/nodes/executors/printers.hpp b/src/plugins/intel_cpu/src/nodes/executors/printers.hpp index d37ab633ba8036..7a96550b3f225c 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/printers.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/printers.hpp @@ -3,25 +3,27 @@ // #ifdef CPU_DEBUG_CAPS -#pragma once +# pragma once -#include -#include "executor_config.hpp" +# include + +# include "executor_config.hpp" namespace ov { namespace intel_cpu { namespace executor { -template struct Config; +template +struct Config; } struct FCAttrs; -std::ostream & operator<<(std::ostream & os, const FCAttrs& attrs); -std::ostream & operator<<(std::ostream & os, const PostOps& postOps); +std::ostream& operator<<(std::ostream& os, const FCAttrs& attrs); +std::ostream& operator<<(std::ostream& os, const PostOps& postOps); -template -std::ostream & operator<<(std::ostream & os, const executor::Config& config) { +template +std::ostream& operator<<(std::ostream& os, const executor::Config& config) { for (const auto& desc : config.descs) { const auto id = desc.first; const auto descPtr = desc.second; @@ -34,7 +36,7 @@ std::ostream & operator<<(std::ostream & os, const executor::Config& conf return os; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov -#endif // CPU_DEBUG_CAPS +#endif // CPU_DEBUG_CAPS diff --git a/src/plugins/intel_cpu/src/nodes/executors/reduce.cpp b/src/plugins/intel_cpu/src/nodes/executors/reduce.cpp index 8e091f0282eb5d..6039813d8fdd28 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/reduce.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/reduce.cpp @@ -9,5 +9,5 @@ namespace intel_cpu { ReduceExecutor::ReduceExecutor(const ExecutorContext::CPtr 
context) : context(context) {} -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/reduce.hpp b/src/plugins/intel_cpu/src/nodes/executors/reduce.hpp index 8aa6e8f0aaa4ac..21b730a197df3a 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/reduce.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/reduce.hpp @@ -5,9 +5,9 @@ #pragma once #include "cpu_memory.h" -#include "onednn/iml_type_mapper.h" #include "dnnl_scratch_pad.h" #include "executor.hpp" +#include "onednn/iml_type_mapper.h" namespace ov { namespace intel_cpu { @@ -24,9 +24,11 @@ class ReduceExecutor { virtual bool init(const ReduceAttrs& reduceAttrs, const std::vector& srcDescs, const std::vector& dstDescs, - const dnnl::primitive_attr &attr) = 0; + const dnnl::primitive_attr& attr) = 0; - virtual void exec(const std::vector& src, const std::vector& dst, const void *post_ops_data_) = 0; + virtual void exec(const std::vector& src, + const std::vector& dst, + const void* post_ops_data_) = 0; virtual ~ReduceExecutor() = default; virtual impl_desc_type getImplType() const = 0; @@ -51,5 +53,5 @@ class ReduceExecutorBuilder { using ReduceExecutorBuilderPtr = std::shared_ptr; using ReduceExecutorBuilderCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/reduce_list.cpp b/src/plugins/intel_cpu/src/nodes/executors/reduce_list.cpp index aec5c7eb905865..e6f035879a2cc6 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/reduce_list.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/reduce_list.cpp @@ -9,11 +9,10 @@ namespace intel_cpu { const std::vector& getReduceExecutorsList() { static std::vector descs = { - OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared()) - }; + 
OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared())}; return descs; } -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/reduce_list.hpp b/src/plugins/intel_cpu/src/nodes/executors/reduce_list.hpp index ea2543a495e64c..faffdebc947c02 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/reduce_list.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/reduce_list.hpp @@ -5,14 +5,13 @@ #pragma once #include "executor.hpp" - #include "reduce.hpp" #if defined(OV_CPU_WITH_ACL) -#include "acl/acl_reduce.hpp" +# include "acl/acl_reduce.hpp" #endif -#include "onednn/iml_type_mapper.h" #include "common/primitive_cache.hpp" +#include "onednn/iml_type_mapper.h" namespace ov { namespace intel_cpu { @@ -27,9 +26,10 @@ const std::vector& getReduceExecutorsList(); class ReduceExecutorFactory : public ExecutorFactoryLegacy { public: ReduceExecutorFactory(const ReduceAttrs& reduceAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs, - const ExecutorContext::CPtr context) : ExecutorFactoryLegacy(context) { + const std::vector& srcDescs, + const std::vector& dstDescs, + const ExecutorContext::CPtr context) + : ExecutorFactoryLegacy(context) { for (auto& desc : getReduceExecutorsList()) { if (desc.builder->isSupported(reduceAttrs, srcDescs, dstDescs)) { supportedDescs.push_back(desc); @@ -39,9 +39,9 @@ class ReduceExecutorFactory : public ExecutorFactoryLegacy { ~ReduceExecutorFactory() = default; virtual ReduceExecutorPtr makeExecutor(const ReduceAttrs& reduceAttrs, - const std::vector& srcDescs, - const std::vector& dstDescs, - const dnnl::primitive_attr &attr) { + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr) { auto build = [&](const ReduceExecutorDesc* desc) { auto executor = desc->builder->makeExecutor(context); if (executor->init(reduceAttrs, srcDescs, 
dstDescs, attr)) { @@ -52,7 +52,6 @@ class ReduceExecutorFactory : public ExecutorFactoryLegacy { return ptr; }; - if (chosenDesc) { if (auto executor = build(chosenDesc)) { return executor; @@ -81,5 +80,5 @@ class ReduceExecutorFactory : public ExecutorFactoryLegacy { using ReduceExecutorFactoryPtr = std::shared_ptr; using ReduceExecutorFactoryCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/transpose.cpp b/src/plugins/intel_cpu/src/nodes/executors/transpose.cpp index 57e2e028827a62..b63e32e39ebf8d 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/transpose.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/transpose.cpp @@ -2,9 +2,11 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "transpose.hpp" + #include + #include "openvino/core/parallel.hpp" -#include "transpose.hpp" namespace ov { namespace intel_cpu { @@ -33,27 +35,27 @@ jit_permute_config_params TransposeExecutor::prepareParams(const PermuteParams& } for (int i = tmp_order.size() - 1; i >= 0; i--) { - int pos = std::distance(std::find( - src_block_order.rbegin(), src_block_order.rend(), tmp_order[i]), src_block_order.rend() - 1); + int pos = std::distance(std::find(src_block_order.rbegin(), src_block_order.rend(), tmp_order[i]), + src_block_order.rend() - 1); if (pos != -1) { new_src_block_strides[i] = src_block_strides[pos]; src_block_order.erase(src_block_order.begin() + pos); src_block_strides.erase(src_block_strides.begin() + pos); mask[i] = 0; } else { - new_src_block_strides[i] = new_src_block_strides[tmp_order.size() - 1] * params.dst_block_dims[tmp_order.size() - 1]; + new_src_block_strides[i] = + new_src_block_strides[tmp_order.size() - 1] * params.dst_block_dims[tmp_order.size() - 1]; mask[i] = 1; mask[tmp_order.size() - 1] = 1; } } if (!src_block_order.empty()) { int pos = std::distance(tmp_order.begin(), std::find(tmp_order.begin(), tmp_order.end(), 
src_block_order[0])); - new_src_block_strides.insert(new_src_block_strides.begin() + pos, - src_block_strides[0]); - new_dst_block_strides.insert(new_dst_block_strides.begin() + pos, - new_dst_block_strides[pos] * params.src_block_dims[params.src_block_dims.size() - 1]); - new_dst_block_order.insert(new_dst_block_order.begin() + pos, - new_dst_block_order[pos]); + new_src_block_strides.insert(new_src_block_strides.begin() + pos, src_block_strides[0]); + new_dst_block_strides.insert( + new_dst_block_strides.begin() + pos, + new_dst_block_strides[pos] * params.src_block_dims[params.src_block_dims.size() - 1]); + new_dst_block_order.insert(new_dst_block_order.begin() + pos, new_dst_block_order[pos]); new_dst_block_dims.insert(new_dst_block_dims.begin() + pos + 1, params.src_block_dims[params.src_block_dims.size() - 1]); new_dst_block_dims[pos] = div_up(new_dst_block_dims[pos], new_dst_block_dims[pos + 1]); @@ -107,12 +109,12 @@ jit_permute_config_params TransposeExecutor::prepareParams(const PermuteParams& } int max_threads = parallel_get_max_threads(); - const int n_max = 3; // max count dims for parallel + const int n_max = 3; // max count dims for parallel int n = 0; int work_amount = sorted_dst_dims[0]; for (size_t i = 1; i < sorted_dst_dims.size() && n < n_max; i++) { n++; - if (work_amount >= 4 * max_threads) { // 4 * max_threads is a specially selected value for best performance + if (work_amount >= 4 * max_threads) { // 4 * max_threads is a specially selected value for best performance break; } work_amount *= sorted_dst_dims[i]; @@ -128,5 +130,5 @@ jit_permute_config_params TransposeExecutor::prepareParams(const PermuteParams& return jcp; } -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/executors/transpose.hpp b/src/plugins/intel_cpu/src/nodes/executors/transpose.hpp index 15f2d5085cd5ad..99e0b0a2742a78 100644 --- 
a/src/plugins/intel_cpu/src/nodes/executors/transpose.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/transpose.hpp @@ -5,9 +5,9 @@ #pragma once #include "cpu_memory.h" -#include "onednn/iml_type_mapper.h" #include "executor.hpp" #include "nodes/common/permute_kernel.h" +#include "onednn/iml_type_mapper.h" namespace ov { namespace intel_cpu { @@ -23,8 +23,9 @@ class TransposeExecutor : public Executor { virtual bool init(const TransposeParams& transposeParams, const std::vector& srcDescs, const std::vector& dstDescs, - const dnnl::primitive_attr &attr) = 0; + const dnnl::primitive_attr& attr) = 0; virtual ~TransposeExecutor() = default; + protected: PermuteParams permuteParams; const ExecutorContext::CPtr context; @@ -44,5 +45,5 @@ class TransposeExecutorBuilder { using TransposeExecutorBuilderPtr = std::shared_ptr; using TransposeExecutorBuilderCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/transpose_list.cpp b/src/plugins/intel_cpu/src/nodes/executors/transpose_list.cpp index 31db070d04ffe3..f0e72f4bec1ae2 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/transpose_list.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/transpose_list.cpp @@ -9,20 +9,19 @@ namespace intel_cpu { const std::vector& getTransposeExecutorsList() { static const std::vector descs = { - OV_CPU_INSTANCE_COMMON(ExecutorType::Common, std::make_shared()) + OV_CPU_INSTANCE_COMMON(ExecutorType::Common, std::make_shared()) OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared()) - OV_CPU_INSTANCE_MLAS_ARM64(ExecutorType::Mlas, std::make_shared()) - OV_CPU_INSTANCE_X64(ExecutorType::jit_x64, std::make_shared()) - OV_CPU_INSTANCE_COMMON(ExecutorType::Common, std::make_shared()) - }; + OV_CPU_INSTANCE_MLAS_ARM64(ExecutorType::Mlas, std::make_shared()) + OV_CPU_INSTANCE_X64(ExecutorType::jit_x64, std::make_shared()) + OV_CPU_INSTANCE_COMMON(ExecutorType::Common, 
std::make_shared())}; return descs; } TransposeExecutorPtr TransposeExecutorFactory::makeExecutor(const TransposeParams& transposeParams, - const std::vector& srcDescs, - const std::vector& dstDescs, - const dnnl::primitive_attr &attr) { + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr) { auto build = [&](const TransposeExecutorDesc* desc) { auto executor = desc->builder->makeExecutor(context); if (executor->init(transposeParams, srcDescs, dstDescs, attr)) { @@ -48,5 +47,5 @@ TransposeExecutorPtr TransposeExecutorFactory::makeExecutor(const TransposeParam OPENVINO_THROW("Supported executor is not found"); } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/transpose_list.hpp b/src/plugins/intel_cpu/src/nodes/executors/transpose_list.hpp index 90141a6194592e..c81769fd1d0539 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/transpose_list.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/transpose_list.hpp @@ -5,19 +5,17 @@ #pragma once #include "executor.hpp" - #include "transpose.hpp" #if defined(OV_CPU_WITH_ACL) -#include "acl/acl_transpose.hpp" +# include "acl/acl_transpose.hpp" #endif +#include "common/primitive_cache.hpp" #include "common/ref_opt_transpose.hpp" #include "common/ref_transpose.hpp" #include "mlas/mlas_transpose.hpp" -#include "x64/jit_transpose.hpp" - #include "onednn/iml_type_mapper.h" -#include "common/primitive_cache.hpp" +#include "x64/jit_transpose.hpp" namespace ov { namespace intel_cpu { @@ -31,22 +29,23 @@ const std::vector& getTransposeExecutorsList(); class TransposeExecutorFactory : public ExecutorFactoryLegacy { public: -TransposeExecutorFactory(const TransposeParams& transposeParams, - const std::vector& srcDescs, - const std::vector& dstDescs, - const ExecutorContext::CPtr context) : ExecutorFactoryLegacy(context) { - for (auto& desc : getTransposeExecutorsList()) { - if 
(desc.builder->isSupported(transposeParams, srcDescs, dstDescs)) { - supportedDescs.push_back(desc); + TransposeExecutorFactory(const TransposeParams& transposeParams, + const std::vector& srcDescs, + const std::vector& dstDescs, + const ExecutorContext::CPtr context) + : ExecutorFactoryLegacy(context) { + for (auto& desc : getTransposeExecutorsList()) { + if (desc.builder->isSupported(transposeParams, srcDescs, dstDescs)) { + supportedDescs.push_back(desc); + } } } -} -~TransposeExecutorFactory() = default; -virtual TransposeExecutorPtr makeExecutor(const TransposeParams& transposeParams, - const std::vector& srcDescs, - const std::vector& dstDescs, - const dnnl::primitive_attr &attr); + ~TransposeExecutorFactory() = default; + virtual TransposeExecutorPtr makeExecutor(const TransposeParams& transposeParams, + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr); private: std::vector supportedDescs; @@ -56,5 +55,5 @@ virtual TransposeExecutorPtr makeExecutor(const TransposeParams& transposeParams using TransposeExecutorFactoryPtr = std::shared_ptr; using TransposeExecutorFactoryCPtr = std::shared_ptr; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/type_mask.hpp b/src/plugins/intel_cpu/src/nodes/executors/type_mask.hpp index d492bd6b6f368a..ef9fdac7f19208 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/type_mask.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/type_mask.hpp @@ -14,29 +14,29 @@ namespace intel_cpu { struct TypeMask { enum Value : uint64_t { _undefined = 1 << 0, - _dynamic = 1 << 1, - _boolean = 1 << 2, - _bf16 = 1 << 3, - _f16 = 1 << 4, - _f32 = 1 << 5, - _f64 = 1 << 6, - _i4 = 1 << 7, - _i8 = 1 << 8, - _i16 = 1 << 9, - _i32 = 1 << 10, - _i64 = 1 << 11, - _u1 = 1 << 12, - _u4 = 1 << 13, - _u8 = 1 << 14, - _u16 = 1 << 15, - _u32 = 1 << 16, - _u64 = 1 << 17, - _nf4 = 1 << 18, - _f8e4m3 = 1 << 19, - 
_f8e5m2 = 1 << 20, - _string = 1 << 21, - _f4e2m1 = 1 << 22, - _f8e8m0 = 1 << 23, + _dynamic = 1 << 1, + _boolean = 1 << 2, + _bf16 = 1 << 3, + _f16 = 1 << 4, + _f32 = 1 << 5, + _f64 = 1 << 6, + _i4 = 1 << 7, + _i8 = 1 << 8, + _i16 = 1 << 9, + _i32 = 1 << 10, + _i64 = 1 << 11, + _u1 = 1 << 12, + _u4 = 1 << 13, + _u8 = 1 << 14, + _u16 = 1 << 15, + _u32 = 1 << 16, + _u64 = 1 << 17, + _nf4 = 1 << 18, + _f8e4m3 = 1 << 19, + _f8e5m2 = 1 << 20, + _string = 1 << 21, + _f4e2m1 = 1 << 22, + _f8e8m0 = 1 << 23, }; TypeMask(const ov::element::Type precision) : value(generateMask(precision)), precision(precision) {} diff --git a/src/plugins/intel_cpu/src/nodes/executors/variable_executor.hpp b/src/plugins/intel_cpu/src/nodes/executors/variable_executor.hpp new file mode 100644 index 00000000000000..8dfb7a4c63fde4 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/variable_executor.hpp @@ -0,0 +1,140 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "executor.hpp" +#include "executor_config.hpp" +#include "executor_implementation.hpp" +#include "nodes/executors/graph_emitter.hpp" + +namespace ov { +namespace intel_cpu { + +/** + * A stateful (variable) executor + * Contains two or more executors. 
+ * Switches between the executors based on provided Memory (more precisely based on in / out shapes) + */ +template +class VariableExecutor : public Executor { +public: + using ExecutorImplementationRef = std::reference_wrapper>; + + VariableExecutor(const MemoryArgs& memory, + const Attrs& attrs, + const PostOps& postOps, + const ExecutorContext::CPtr context, + std::vector suitableImplementations) + : m_attrs(attrs), + m_postOps(postOps), + m_context(context), + m_suitableImplementations(std::move(suitableImplementations)), + m_implementationRequiresFallback( + cacheFallbackStatus(m_suitableImplementations, + GraphEmitter::createConfig(memory, m_attrs, m_postOps))), + m_executors(m_suitableImplementations.size()) { + const size_t implId = select(memory, 0); + m_executors[implId] = create(implId, memory); + m_implId = implId; + } + + bool update(const MemoryArgs& memory) override { + for (auto implId = select(memory, 0); implId < m_suitableImplementations.size(); + implId = select(memory, implId)) { + if (!m_executors[implId]) { + m_executors[implId] = create(implId, memory); + } + + if (m_executors[implId]->update(memory)) { + m_implId = implId; + return true; + } + } + + return false; + } + + void execute(const MemoryArgs& memory) override { + m_executors[m_implId]->execute(memory); + } + + impl_desc_type implType() const override { + return m_executors[m_implId]->implType(); + } + + void moveMemToNumaNode(int numaID) override { + m_executors[m_implId]->moveMemToNumaNode(numaID); + } + +private: + /** + * @brief Returns a fallback status for each suitable implementation. 
+ */ + static std::vector cacheFallbackStatus(const std::vector& suitableImplementations, + const executor::Config& config) { + std::vector implementationRequiresFallback(suitableImplementations.size()); + std::transform(suitableImplementations.begin(), + suitableImplementations.end(), + implementationRequiresFallback.begin(), + [&config](const ExecutorImplementationRef& impl) { + return impl.get().requiresFallback(config); + }); + + return implementationRequiresFallback; + } + + size_t select(const MemoryArgs& memory, const size_t startIdx) const { + OPENVINO_ASSERT(startIdx < m_suitableImplementations.size(), + "Failed to find an implementation since start indx: ", + startIdx, + " is out of range of the suitable implementations array: ", + m_suitableImplementations.size()); + + auto startIt = m_suitableImplementations.begin() + startIdx; + + const auto selectedImplementation = + std::find_if(startIt, + m_suitableImplementations.end(), + [&memory](const ExecutorImplementationRef& implementation) { + return implementation.get().shapeAgnostic() || implementation.get().acceptsShapes(memory); + }); + + OPENVINO_ASSERT(selectedImplementation != m_suitableImplementations.end(), "Failed to select an implemetation"); + + return std::distance(m_suitableImplementations.begin(), selectedImplementation); + } + + ExecutorPtr create(const size_t implId, const MemoryArgs& memory) { + assert(implId < m_executors.size() && implId < m_suitableImplementations.size()); + + auto createWithFallback = [this](const size_t implId, const MemoryArgs& memory) { + const auto& impl = m_suitableImplementations[implId].get(); + + if (m_implementationRequiresFallback[implId]) { + auto config = GraphEmitter::createConfig(memory, m_attrs, m_postOps); + if (auto fallbackConfig = impl.requiresFallback(config)) { + return GraphEmitter::fallback(config, *fallbackConfig, memory, m_context, impl.name()); + } + } + + return impl.create(m_attrs, m_postOps, memory, m_context); + }; + + return 
createWithFallback(implId, memory); + } + + const Attrs& m_attrs; + const PostOps& m_postOps; + const ExecutorContext::CPtr m_context; + std::vector m_suitableImplementations; + // stores fallback status to avoid performing the check for every make() call + std::vector m_implementationRequiresFallback; + // executors cache + std::vector m_executors; + size_t m_implId; +}; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/x64/jit_transpose.cpp b/src/plugins/intel_cpu/src/nodes/executors/x64/jit_transpose.cpp index bfcc7ad4ae672a..79c578aaacda61 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/x64/jit_transpose.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/x64/jit_transpose.cpp @@ -3,6 +3,7 @@ // #include "jit_transpose.hpp" + #include "cpu/x64/cpu_isa_traits.hpp" using namespace dnnl::impl::cpu; @@ -21,9 +22,10 @@ void JitTransposeExecutor::exec(const std::vector& src, const std::v pKernel->execute(srcData, dstData, MB); } -bool JitTransposeExecutor::init(const TransposeParams &transposeParams, - const std::vector &srcDescs, - const std::vector &dstDescs, const dnnl::primitive_attr &attr) { +bool JitTransposeExecutor::init(const TransposeParams& transposeParams, + const std::vector& srcDescs, + const std::vector& dstDescs, + const dnnl::primitive_attr& attr) { pKernel = std::make_shared(transposeParams.permuteParams); return true; } @@ -35,9 +37,9 @@ bool JitTransposeExecutorBuilder::isSupported(const TransposeParams& transposePa if (mayiuse(x64::sse41)) { return true; } -#endif // OPENVINO_ARCH_X86_64 +#endif // OPENVINO_ARCH_X86_64 return false; } -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/x64/jit_transpose.hpp b/src/plugins/intel_cpu/src/nodes/executors/x64/jit_transpose.hpp index d37ac9e5db5ef5..fd6d54257f1489 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/x64/jit_transpose.hpp +++ 
b/src/plugins/intel_cpu/src/nodes/executors/x64/jit_transpose.hpp @@ -16,9 +16,12 @@ class JitTransposeExecutor : public TransposeExecutor { bool init(const TransposeParams& transposeParams, const std::vector& srcDescs, const std::vector& dstDescs, - const dnnl::primitive_attr &attr) override; + const dnnl::primitive_attr& attr) override; void exec(const std::vector& src, const std::vector& dst) override; - impl_desc_type implType() const override { return impl_desc_type::jit; } + impl_desc_type implType() const override { + return impl_desc_type::jit; + } + private: std::shared_ptr pKernel; }; @@ -33,5 +36,5 @@ class JitTransposeExecutorBuilder : public TransposeExecutorBuilder { } }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/experimental_detectron_detection_output.cpp b/src/plugins/intel_cpu/src/nodes/experimental_detectron_detection_output.cpp index 441e013af2cbbf..dc58aabe26635d 100644 --- a/src/plugins/intel_cpu/src/nodes/experimental_detectron_detection_output.cpp +++ b/src/plugins/intel_cpu/src/nodes/experimental_detectron_detection_output.cpp @@ -2,12 +2,13 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "openvino/op/experimental_detectron_detection_output.hpp" + #include #include -#include "openvino/op/experimental_detectron_detection_output.hpp" -#include "openvino/core/parallel.hpp" #include "experimental_detectron_detection_output.h" +#include "openvino/core/parallel.hpp" namespace ov { namespace intel_cpu { @@ -36,13 +37,19 @@ struct Indexer { } }; -static -void refine_boxes(const float* boxes, const float* deltas, const float* weights, const float* scores, - float* refined_boxes, float* refined_boxes_areas, float* refined_scores, - const int rois_num, const int classes_num, - const float img_H, const float img_W, - const float max_delta_log_wh, - float coordinates_offset) { +static void refine_boxes(const float* boxes, + const float* deltas, + const 
float* weights, + const float* scores, + float* refined_boxes, + float* refined_boxes_areas, + float* refined_scores, + const int rois_num, + const int classes_num, + const float img_H, + const float img_W, + const float max_delta_log_wh, + float coordinates_offset) { Indexer box_idx({rois_num, 4}); Indexer delta_idx({rois_num, classes_num, 4}); Indexer score_idx({rois_num, classes_num}); @@ -114,21 +121,22 @@ static bool SortScorePairDescend(const std::pair>& pa return (pair1.first > pair2.first) || ((pair1.first == pair2.first) && (pair1.second.second < pair2.second.second)); } - struct ConfidenceComparator { explicit ConfidenceComparator(const float* conf_data) : _conf_data(conf_data) {} bool operator()(int idx1, int idx2) { - if (_conf_data[idx1] > _conf_data[idx2]) return true; - if (_conf_data[idx1] < _conf_data[idx2]) return false; + if (_conf_data[idx1] > _conf_data[idx2]) + return true; + if (_conf_data[idx1] < _conf_data[idx2]) + return false; return idx1 < idx2; } const float* _conf_data; }; -static inline float JaccardOverlap(const float *decoded_bbox, - const float *bbox_sizes, +static inline float JaccardOverlap(const float* decoded_bbox, + const float* bbox_sizes, const int idx1, const int idx2, const float coordinates_offset = 1) { @@ -151,7 +159,7 @@ static inline float JaccardOverlap(const float *decoded_bbox, float intersect_xmax = (std::min)(xmax1, xmax2); float intersect_ymax = (std::min)(ymax1, ymax2); - float intersect_width = intersect_xmax - intersect_xmin + coordinates_offset; + float intersect_width = intersect_xmax - intersect_xmin + coordinates_offset; float intersect_height = intersect_ymax - intersect_ymin + coordinates_offset; if (intersect_width <= 0 || intersect_height <= 0) { @@ -165,7 +173,6 @@ static inline float JaccardOverlap(const float *decoded_bbox, return intersect_size / (bbox1_size + bbox2_size - intersect_size); } - static void nms_cf(const float* conf_data, const float* bboxes, const float* sizes, @@ -187,8 +194,10 @@ 
static void nms_cf(const float* conf_data, int num_output_scores = (pre_nms_topn == -1 ? count : (std::min)(pre_nms_topn, count)); - std::partial_sort_copy(indices, indices + count, - buffer, buffer + num_output_scores, + std::partial_sort_copy(indices, + indices + count, + buffer, + buffer + num_output_scores, ConfidenceComparator(conf_data)); detections = 0; @@ -221,11 +230,13 @@ bool ExperimentalDetectronDetectionOutput::needPrepareParams() const { return false; } -bool ExperimentalDetectronDetectionOutput::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool ExperimentalDetectronDetectionOutput::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage) noexcept { try { const auto doOp = ov::as_type_ptr(op); if (!doOp) { - errorMessage = "Node is not an instance of the ExperimentalDetectronDetectionOutput from the operations set v6."; + errorMessage = + "Node is not an instance of the ExperimentalDetectronDetectionOutput from the operations set v6."; return false; } } catch (...) 
{ @@ -294,10 +305,17 @@ void ExperimentalDetectronDetectionOutput::execute(dnnl::stream strm) { Indexer refined_box_idx({classes_num_, rois_num, 4}); Indexer refined_score_idx({classes_num_, rois_num}); - refine_boxes(boxes, deltas, &deltas_weights_[0], scores, - &refined_boxes[0], &refined_boxes_areas[0], &refined_scores[0], - rois_num, classes_num_, - img_H, img_W, + refine_boxes(boxes, + deltas, + &deltas_weights_[0], + scores, + &refined_boxes[0], + &refined_boxes_areas[0], + &refined_scores[0], + rois_num, + classes_num_, + img_H, + img_W, max_delta_log_wh_, 1.0f); @@ -353,7 +371,7 @@ void ExperimentalDetectronDetectionOutput::execute(dnnl::stream strm) { memset(output_classes, 0, max_detections_per_image_ * sizeof(output_classes[0])); int i = 0; - for (const auto & detection : conf_index_class_map) { + for (const auto& detection : conf_index_class_map) { float score = detection.first; int cls = detection.second.first; int idx = detection.second.second; @@ -371,6 +389,6 @@ bool ExperimentalDetectronDetectionOutput::created() const { return getType() == Type::ExperimentalDetectronDetectionOutput; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/experimental_detectron_detection_output.h b/src/plugins/intel_cpu/src/nodes/experimental_detectron_detection_output.h index 2f76f1004face5..206f807585de7d 100644 --- a/src/plugins/intel_cpu/src/nodes/experimental_detectron_detection_output.h +++ b/src/plugins/intel_cpu/src/nodes/experimental_detectron_detection_output.h @@ -14,25 +14,27 @@ class ExperimentalDetectronDetectionOutput : public Node { public: ExperimentalDetectronDetectionOutput(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool 
created() const override; bool needShapeInfer() const override; bool needPrepareParams() const override; - void executeDynamicImpl(dnnl::stream strm) override { execute(strm); } + void executeDynamicImpl(dnnl::stream strm) override { + execute(strm); + } static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; private: - const int INPUT_ROIS {0}; - const int INPUT_DELTAS {1}; - const int INPUT_SCORES {2}; - const int INPUT_IM_INFO {3}; + const int INPUT_ROIS{0}; + const int INPUT_DELTAS{1}; + const int INPUT_SCORES{2}; + const int INPUT_IM_INFO{3}; - const int OUTPUT_BOXES {0}; - const int OUTPUT_CLASSES {1}; - const int OUTPUT_SCORES {2}; + const int OUTPUT_BOXES{0}; + const int OUTPUT_CLASSES{1}; + const int OUTPUT_SCORES{2}; float score_threshold_; float nms_threshold_; @@ -44,6 +46,6 @@ class ExperimentalDetectronDetectionOutput : public Node { std::vector deltas_weights_; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/experimental_detectron_generate_proposals_single_image.cpp b/src/plugins/intel_cpu/src/nodes/experimental_detectron_generate_proposals_single_image.cpp index 33f17c8d95f093..778e796aacc11a 100644 --- a/src/plugins/intel_cpu/src/nodes/experimental_detectron_generate_proposals_single_image.cpp +++ b/src/plugins/intel_cpu/src/nodes/experimental_detectron_generate_proposals_single_image.cpp @@ -2,22 +2,22 @@ // SPDX-License-Identifier: Apache-2.0 // -#include +#include #include #include +#include #include -#include #include -#include +#include #if defined(HAVE_AVX2) -#include +# include #endif -#include "openvino/op/experimental_detectron_generate_proposals.hpp" -#include "openvino/core/parallel.hpp" #include "common/cpu_memcpy.h" #include "experimental_detectron_generate_proposals_single_image.h" +#include "openvino/core/parallel.hpp" +#include 
"openvino/op/experimental_detectron_generate_proposals.hpp" namespace ov { namespace intel_cpu { @@ -29,20 +29,29 @@ struct Indexer4d { int dim23_; int dim123_; - explicit Indexer4d(int dim0, int dim1, int dim2, int dim3): - dim3_(dim3), dim23_(dim2 * dim3), dim123_(dim1 * dim2 * dim3) { + explicit Indexer4d(int dim0, int dim1, int dim2, int dim3) + : dim3_(dim3), + dim23_(dim2 * dim3), + dim123_(dim1 * dim2 * dim3) { (void)dim0; } int operator()(int i, int j, int k, int n) const { - return i * dim123_ + j * dim23_ + k * dim3_ + n; + return i * dim123_ + j * dim23_ + k * dim3_ + n; } }; -void refine_anchors(const float* deltas, const float* scores, const float* anchors, - float* proposals, const int anchors_num, const int bottom_H, - const int bottom_W, const float img_H, const float img_W, - const float min_box_H, const float min_box_W, +void refine_anchors(const float* deltas, + const float* scores, + const float* anchors, + float* proposals, + const int anchors_num, + const int bottom_H, + const int bottom_W, + const float img_H, + const float img_W, + const float min_box_H, + const float min_box_W, const float max_delta_log_wh, float coordinates_offset) { Indexer4d delta_idx(anchors_num, 4, bottom_H, bottom_W); @@ -108,17 +117,22 @@ void refine_anchors(const float* deltas, const float* scores, const float* ancho void unpack_boxes(const float* p_proposals, float* unpacked_boxes, int pre_nms_topn) { parallel_for(pre_nms_topn, [&](size_t i) { - unpacked_boxes[0*pre_nms_topn + i] = p_proposals[5*i + 0]; - unpacked_boxes[1*pre_nms_topn + i] = p_proposals[5*i + 1]; - unpacked_boxes[2*pre_nms_topn + i] = p_proposals[5*i + 2]; - unpacked_boxes[3*pre_nms_topn + i] = p_proposals[5*i + 3]; - unpacked_boxes[4*pre_nms_topn + i] = p_proposals[5*i + 4]; + unpacked_boxes[0 * pre_nms_topn + i] = p_proposals[5 * i + 0]; + unpacked_boxes[1 * pre_nms_topn + i] = p_proposals[5 * i + 1]; + unpacked_boxes[2 * pre_nms_topn + i] = p_proposals[5 * i + 2]; + unpacked_boxes[3 * 
pre_nms_topn + i] = p_proposals[5 * i + 3]; + unpacked_boxes[4 * pre_nms_topn + i] = p_proposals[5 * i + 4]; }); } -void nms_cpu(const int num_boxes, int is_dead[], - const float* boxes, int index_out[], int* const num_out, - const int base_index, const float nms_thresh, const int max_num_out, +void nms_cpu(const int num_boxes, + int is_dead[], + const float* boxes, + int index_out[], + int* const num_out, + const int base_index, + const float nms_thresh, + const int max_num_out, float coordinates_offset) { const int num_proposals = num_boxes; int count = 0; @@ -131,9 +145,9 @@ void nms_cpu(const int num_boxes, int is_dead[], std::memset(is_dead, 0, num_boxes * sizeof(int)); #if defined(HAVE_AVX2) - __m256 vc_fone = _mm256_set1_ps(coordinates_offset); + __m256 vc_fone = _mm256_set1_ps(coordinates_offset); __m256i vc_ione = _mm256_set1_epi32(1); - __m256 vc_zero = _mm256_set1_ps(0.0f); + __m256 vc_zero = _mm256_set1_ps(0.0f); __m256 vc_nms_thresh = _mm256_set1_ps(nms_thresh); #endif @@ -154,13 +168,13 @@ void nms_cpu(const int num_boxes, int is_dead[], __m256 vx1i = _mm256_set1_ps(x1[box]); __m256 vy1i = _mm256_set1_ps(y1[box]); - __m256 vA_width = _mm256_sub_ps(vx1i, vx0i); + __m256 vA_width = _mm256_sub_ps(vx1i, vx0i); __m256 vA_height = _mm256_sub_ps(vy1i, vy0i); - __m256 vA_area = _mm256_mul_ps(_mm256_add_ps(vA_width, vc_fone), _mm256_add_ps(vA_height, vc_fone)); + __m256 vA_area = _mm256_mul_ps(_mm256_add_ps(vA_width, vc_fone), _mm256_add_ps(vA_height, vc_fone)); for (; tail <= num_boxes - 8; tail += 8) { - __m256i *pdst = reinterpret_cast<__m256i*>(is_dead + tail); - __m256i vdst = _mm256_loadu_si256(pdst); + __m256i* pdst = reinterpret_cast<__m256i*>(is_dead + tail); + __m256i vdst = _mm256_loadu_si256(pdst); __m256 vx0j = _mm256_loadu_ps(x0 + tail); __m256 vy0j = _mm256_loadu_ps(y0 + tail); @@ -172,13 +186,13 @@ void nms_cpu(const int num_boxes, int is_dead[], __m256 vx1 = _mm256_min_ps(vx1i, vx1j); __m256 vy1 = _mm256_min_ps(vy1i, vy1j); - __m256 vwidth = 
_mm256_add_ps(_mm256_sub_ps(vx1, vx0), vc_fone); + __m256 vwidth = _mm256_add_ps(_mm256_sub_ps(vx1, vx0), vc_fone); __m256 vheight = _mm256_add_ps(_mm256_sub_ps(vy1, vy0), vc_fone); __m256 varea = _mm256_mul_ps(_mm256_max_ps(vc_zero, vwidth), _mm256_max_ps(vc_zero, vheight)); - __m256 vB_width = _mm256_sub_ps(vx1j, vx0j); + __m256 vB_width = _mm256_sub_ps(vx1j, vx0j); __m256 vB_height = _mm256_sub_ps(vy1j, vy0j); - __m256 vB_area = _mm256_mul_ps(_mm256_add_ps(vB_width, vc_fone), _mm256_add_ps(vB_height, vc_fone)); + __m256 vB_area = _mm256_mul_ps(_mm256_add_ps(vB_width, vc_fone), _mm256_add_ps(vB_height, vc_fone)); __m256 vdivisor = _mm256_sub_ps(_mm256_add_ps(vA_area, vB_area), varea); __m256 vintersection_area = _mm256_div_ps(varea, vdivisor); @@ -219,9 +233,9 @@ void nms_cpu(const int num_boxes, int is_dead[], const float y1 = std::min(y1i, y1j); // intersection area - const float width = std::max(0.0f, x1 - x0 + coordinates_offset); - const float height = std::max(0.0f, y1 - y0 + coordinates_offset); - const float area = width * height; + const float width = std::max(0.0f, x1 - x0 + coordinates_offset); + const float height = std::max(0.0f, y1 - y0 + coordinates_offset); + const float area = width * height; // area of A, B const float A_area = (x1i - x0i + coordinates_offset) * (y1i - y0i + coordinates_offset); @@ -239,14 +253,18 @@ void nms_cpu(const int num_boxes, int is_dead[], *num_out = count; } -void fill_output_blobs(const float* proposals, const int* roi_indices, - float* rois, float* scores, - const int num_proposals, const int num_rois, const int post_nms_topn) { - const float *src_x0 = proposals + 0 * num_proposals; - const float *src_y0 = proposals + 1 * num_proposals; - const float *src_x1 = proposals + 2 * num_proposals; - const float *src_y1 = proposals + 3 * num_proposals; - const float *src_score = proposals + 4 * num_proposals; +void fill_output_blobs(const float* proposals, + const int* roi_indices, + float* rois, + float* scores, + const int 
num_proposals, + const int num_rois, + const int post_nms_topn) { + const float* src_x0 = proposals + 0 * num_proposals; + const float* src_y0 = proposals + 1 * num_proposals; + const float* src_x1 = proposals + 2 * num_proposals; + const float* src_y1 = proposals + 3 * num_proposals; + const float* src_score = proposals + 4 * num_proposals; parallel_for(num_rois, [&](size_t i) { int index = roi_indices[i]; @@ -269,10 +287,11 @@ void fill_output_blobs(const float* proposals, const int* roi_indices, } // namespace -bool ExperimentalDetectronGenerateProposalsSingleImage::isSupportedOperation - (const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool ExperimentalDetectronGenerateProposalsSingleImage::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage) noexcept { try { - const auto proposalOp = ov::as_type_ptr(op); + const auto proposalOp = + ov::as_type_ptr(op); if (!proposalOp) { errorMessage = "Node is not an instance of the Proposal from the operations set v0."; return false; @@ -313,8 +332,7 @@ void ExperimentalDetectronGenerateProposalsSingleImage::initSupportedPrimitiveDe {LayoutType::ncsp, ov::element::f32}, {LayoutType::ncsp, ov::element::f32}, {LayoutType::ncsp, ov::element::f32}}, - {{LayoutType::ncsp, ov::element::f32}, - {LayoutType::ncsp, ov::element::f32}}, + {{LayoutType::ncsp, ov::element::f32}, {LayoutType::ncsp, ov::element::f32}}, impl_desc_type::ref_any); } @@ -325,13 +343,13 @@ void ExperimentalDetectronGenerateProposalsSingleImage::execute(dnnl::stream str } size_t anchor_dims_size = 1; - const auto &anchorDims = getParentEdgeAt(INPUT_ANCHORS)->getMemory().getStaticDims(); + const auto& anchorDims = getParentEdgeAt(INPUT_ANCHORS)->getMemory().getStaticDims(); for (size_t i = 0; i < anchorDims.size(); i++) { anchor_dims_size *= anchorDims[i]; } size_t deltas_dims_size = 1; - const auto &deltaDims = getParentEdgeAt(INPUT_DELTAS)->getMemory().getStaticDims(); + const auto& deltaDims = 
getParentEdgeAt(INPUT_DELTAS)->getMemory().getStaticDims(); for (size_t i = 0; i < deltaDims.size(); i++) { deltas_dims_size *= deltaDims[i]; } @@ -339,7 +357,7 @@ void ExperimentalDetectronGenerateProposalsSingleImage::execute(dnnl::stream str OPENVINO_THROW("'Anchors' blob size for ONNXProposal is incompatible with 'deltas' blob size!"); size_t score_dims_size = 1; - const auto &scoreDims = getParentEdgeAt(INPUT_SCORES)->getMemory().getStaticDims(); + const auto& scoreDims = getParentEdgeAt(INPUT_SCORES)->getMemory().getStaticDims(); for (size_t i = 0; i < scoreDims.size(); i++) { score_dims_size *= scoreDims[i]; } @@ -347,13 +365,13 @@ void ExperimentalDetectronGenerateProposalsSingleImage::execute(dnnl::stream str OPENVINO_THROW("'Deltas' blob size for ONNXProposal is incompatible with 'scores' blob size!"); // Prepare memory - const float *p_deltas_item = getSrcDataAtPortAs(INPUT_DELTAS); - const float *p_scores_item = getSrcDataAtPortAs(INPUT_SCORES); - const float *p_anchors_item = getSrcDataAtPortAs(INPUT_ANCHORS); - const float *p_img_info_cpu = getSrcDataAtPortAs(INPUT_IM_INFO); + const float* p_deltas_item = getSrcDataAtPortAs(INPUT_DELTAS); + const float* p_scores_item = getSrcDataAtPortAs(INPUT_SCORES); + const float* p_anchors_item = getSrcDataAtPortAs(INPUT_ANCHORS); + const float* p_img_info_cpu = getSrcDataAtPortAs(INPUT_IM_INFO); - float *p_roi_item = getDstDataAtPortAs(OUTPUT_ROIS); - float *p_roi_score_item = getDstDataAtPortAs(OUTPUT_SCORES); + float* p_roi_item = getDstDataAtPortAs(OUTPUT_ROIS); + float* p_roi_score_item = getDstDataAtPortAs(OUTPUT_SCORES); const int anchors_num = scoreDims[0]; @@ -398,24 +416,45 @@ void ExperimentalDetectronGenerateProposalsSingleImage::execute(dnnl::stream str // Execute int batch_size = 1; // inputs[INPUT_DELTAS]->getTensorDesc().getDims()[0]; for (int n = 0; n < batch_size; ++n) { - refine_anchors(p_deltas_item, p_scores_item, p_anchors_item, - reinterpret_cast(&proposals_[0]), anchors_num, bottom_H, - 
bottom_W, img_H, img_W, - min_box_H, min_box_W, + refine_anchors(p_deltas_item, + p_scores_item, + p_anchors_item, + reinterpret_cast(&proposals_[0]), + anchors_num, + bottom_H, + bottom_W, + img_H, + img_W, + min_box_H, + min_box_W, static_cast(std::log(1000. / 16.)), 1.0f); - std::partial_sort(proposals_.begin(), proposals_.begin() + pre_nms_topn, proposals_.end(), - [](const ProposalBox &struct1, const ProposalBox &struct2) { + std::partial_sort(proposals_.begin(), + proposals_.begin() + pre_nms_topn, + proposals_.end(), + [](const ProposalBox& struct1, const ProposalBox& struct2) { return (struct1.score > struct2.score); }); - unpack_boxes(reinterpret_cast(&proposals_[0]), &unpacked_boxes[0], pre_nms_topn); - nms_cpu(pre_nms_topn, &is_dead[0], &unpacked_boxes[0], &roi_indices_[0], &num_rois, 0, - nms_thresh_, post_nms_topn_, coordinates_offset); - fill_output_blobs(&unpacked_boxes[0], &roi_indices_[0], p_roi_item, p_roi_score_item, - pre_nms_topn, num_rois, post_nms_topn_); + unpack_boxes(reinterpret_cast(&proposals_[0]), &unpacked_boxes[0], pre_nms_topn); + nms_cpu(pre_nms_topn, + &is_dead[0], + &unpacked_boxes[0], + &roi_indices_[0], + &num_rois, + 0, + nms_thresh_, + post_nms_topn_, + coordinates_offset); + fill_output_blobs(&unpacked_boxes[0], + &roi_indices_[0], + p_roi_item, + p_roi_score_item, + pre_nms_topn, + num_rois, + post_nms_topn_); } - } catch (const std::exception &e) { + } catch (const std::exception& e) { std::string errorMsg = e.what(); OPENVINO_THROW(errorMsg); } @@ -433,6 +472,6 @@ bool ExperimentalDetectronGenerateProposalsSingleImage::needPrepareParams() cons return false; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/experimental_detectron_generate_proposals_single_image.h b/src/plugins/intel_cpu/src/nodes/experimental_detectron_generate_proposals_single_image.h index 41aaf63f637e76..d747813e10b258 100644 
--- a/src/plugins/intel_cpu/src/nodes/experimental_detectron_generate_proposals_single_image.h +++ b/src/plugins/intel_cpu/src/nodes/experimental_detectron_generate_proposals_single_image.h @@ -13,16 +13,18 @@ namespace node { class ExperimentalDetectronGenerateProposalsSingleImage : public Node { public: ExperimentalDetectronGenerateProposalsSingleImage(const std::shared_ptr& op, - const GraphContext::CPtr context); + const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; bool needShapeInfer() const override; bool needPrepareParams() const override; - void executeDynamicImpl(dnnl::stream strm) override { execute(strm); } + void executeDynamicImpl(dnnl::stream strm) override { + execute(strm); + } static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; private: @@ -32,12 +34,12 @@ class ExperimentalDetectronGenerateProposalsSingleImage : public Node { // Outputs: // top_rois, shape [max_rois, 4] - const int INPUT_IM_INFO {0}; - const int INPUT_ANCHORS {1}; - const int INPUT_DELTAS {2}; - const int INPUT_SCORES {3}; - const int OUTPUT_ROIS {0}; - const int OUTPUT_SCORES {1}; + const int INPUT_IM_INFO{0}; + const int INPUT_ANCHORS{1}; + const int INPUT_DELTAS{2}; + const int INPUT_SCORES{3}; + const int OUTPUT_ROIS{0}; + const int OUTPUT_SCORES{1}; float min_size_; int pre_nms_topn_; @@ -48,6 +50,6 @@ class ExperimentalDetectronGenerateProposalsSingleImage : public Node { std::vector roi_indices_; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/experimental_detectron_priorgridgenerator.cpp b/src/plugins/intel_cpu/src/nodes/experimental_detectron_priorgridgenerator.cpp index eead95def0a8fb..f7df0e533778ed 
100644 --- a/src/plugins/intel_cpu/src/nodes/experimental_detectron_priorgridgenerator.cpp +++ b/src/plugins/intel_cpu/src/nodes/experimental_detectron_priorgridgenerator.cpp @@ -2,20 +2,22 @@ // SPDX-License-Identifier: Apache-2.0 // -#include +#include "experimental_detectron_priorgridgenerator.h" #include +#include + #include "openvino/core/parallel.hpp" -#include "experimental_detectron_priorgridgenerator.h" namespace ov { namespace intel_cpu { namespace node { bool ExperimentalDetectronPriorGridGenerator::isSupportedOperation(const std::shared_ptr& op, - std::string& errorMessage) noexcept { + std::string& errorMessage) noexcept { try { - const auto priorGridGen = std::dynamic_pointer_cast(op); + const auto priorGridGen = + std::dynamic_pointer_cast(op); if (!priorGridGen) { errorMessage = "Only opset6 ExperimentalDetectronPriorGridGenerator operation is supported"; return false; @@ -39,7 +41,7 @@ ExperimentalDetectronPriorGridGenerator::ExperimentalDetectronPriorGridGenerator if (getOriginalInputsNumber() != 3 || getOriginalOutputsNumber() != 1) OPENVINO_THROW(errorPrefix, " has incorrect number of input/output edges!"); - const auto &attr = priorGridGen->get_attrs(); + const auto& attr = priorGridGen->get_attrs(); grid_w_ = attr.w; grid_h_ = attr.h; stride_h_ = attr.stride_y; @@ -64,11 +66,15 @@ void ExperimentalDetectronPriorGridGenerator::execute(dnnl::stream strm) { // Execute const int layer_width = grid_w_ ? grid_w_ : getParentEdgeAt(INPUT_FEATUREMAP)->getMemory().getStaticDims()[3]; const int layer_height = grid_h_ ? grid_h_ : getParentEdgeAt(INPUT_FEATUREMAP)->getMemory().getStaticDims()[2]; - const float step_w = stride_w_ ? stride_w_ : static_cast(getParentEdgeAt(INPUT_IMAGE)->getMemory().getStaticDims()[3]) / layer_width; - const float step_h = stride_h_ ? stride_h_ : static_cast(getParentEdgeAt(INPUT_IMAGE)->getMemory().getStaticDims()[2]) / layer_height; + const float step_w = + stride_w_ ? 
stride_w_ + : static_cast(getParentEdgeAt(INPUT_IMAGE)->getMemory().getStaticDims()[3]) / layer_width; + const float step_h = + stride_h_ ? stride_h_ + : static_cast(getParentEdgeAt(INPUT_IMAGE)->getMemory().getStaticDims()[2]) / layer_height; - const auto *bottom_data_0 = getSrcDataAtPortAs(0); - auto *top_data_0 = getDstDataAtPortAs(OUTPUT_ROIS); + const auto* bottom_data_0 = getSrcDataAtPortAs(0); + auto* top_data_0 = getDstDataAtPortAs(OUTPUT_ROIS); for (int h = 0; h < layer_height; ++h) { for (int w = 0; w < layer_width; ++w) { @@ -91,6 +97,6 @@ bool ExperimentalDetectronPriorGridGenerator::needPrepareParams() const { return false; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/experimental_detectron_priorgridgenerator.h b/src/plugins/intel_cpu/src/nodes/experimental_detectron_priorgridgenerator.h index cf52b4e5c9b934..47c2c16dc558b9 100644 --- a/src/plugins/intel_cpu/src/nodes/experimental_detectron_priorgridgenerator.h +++ b/src/plugins/intel_cpu/src/nodes/experimental_detectron_priorgridgenerator.h @@ -14,13 +14,15 @@ class ExperimentalDetectronPriorGridGenerator : public Node { public: ExperimentalDetectronPriorGridGenerator(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; bool needPrepareParams() const override; - void executeDynamicImpl(dnnl::stream strm) override { execute(strm); } + void executeDynamicImpl(dnnl::stream strm) override { + execute(strm); + } static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; private: @@ -31,11 +33,11 @@ class ExperimentalDetectronPriorGridGenerator : public Node { // Outputs: // priors_grid, shape [m, 4] - const int 
INPUT_PRIORS {0}; - const int INPUT_FEATUREMAP {1}; - const int INPUT_IMAGE {2}; + const int INPUT_PRIORS{0}; + const int INPUT_FEATUREMAP{1}; + const int INPUT_IMAGE{2}; - const int OUTPUT_ROIS {0}; + const int OUTPUT_ROIS{0}; int grid_w_; int grid_h_; @@ -45,6 +47,6 @@ class ExperimentalDetectronPriorGridGenerator : public Node { std::string errorPrefix; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/experimental_detectron_roifeatureextractor.cpp b/src/plugins/intel_cpu/src/nodes/experimental_detectron_roifeatureextractor.cpp index c92e3c2594d4a9..05f2202537f986 100644 --- a/src/plugins/intel_cpu/src/nodes/experimental_detectron_roifeatureextractor.cpp +++ b/src/plugins/intel_cpu/src/nodes/experimental_detectron_roifeatureextractor.cpp @@ -2,14 +2,15 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "experimental_detectron_roifeatureextractor.h" + +#include +#include #include #include -#include -#include -#include "openvino/core/parallel.hpp" #include "common/cpu_memcpy.h" -#include "experimental_detectron_roifeatureextractor.h" +#include "openvino/core/parallel.hpp" namespace ov { namespace intel_cpu { @@ -30,31 +31,28 @@ struct PreCalc { }; template -void pre_calc_for_bilinear_interpolate( - const int height, - const int width, - const int pooled_height, - const int pooled_width, - const int iy_upper, - const int ix_upper, - T roi_start_h, - T roi_start_w, - T bin_size_h, - T bin_size_w, - int roi_bin_grid_h, - int roi_bin_grid_w, - std::vector>& pre_calc) { +void pre_calc_for_bilinear_interpolate(const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int iy_upper, + const int ix_upper, + T roi_start_h, + T roi_start_w, + T bin_size_h, + T bin_size_w, + int roi_bin_grid_h, + int roi_bin_grid_w, + std::vector>& pre_calc) { int pre_calc_index = 0; for (int ph = 0; ph < pooled_height; 
ph++) { for (int pw = 0; pw < pooled_width; pw++) { for (int iy = 0; iy < iy_upper; iy++) { const T yy = roi_start_h + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 for (int ix = 0; ix < ix_upper; ix++) { const T xx = roi_start_w + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); T x = xx; T y = yy; @@ -126,19 +124,18 @@ void pre_calc_for_bilinear_interpolate( } template -void ROIAlignForward_cpu_kernel( - const int nthreads, - const T* bottom_data, - const T& spatial_scale, - const int channels, - const int height, - const int width, - const int pooled_height, - const int pooled_width, - const int sampling_ratio, - const T* bottom_rois, - const bool aligned, - T* top_data) { +void ROIAlignForward_cpu_kernel(const int nthreads, + const T* bottom_data, + const T& spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, + const T* bottom_rois, + const bool aligned, + T* top_data) { int roi_cols = 4; int n_rois = nthreads / channels / pooled_width / pooled_height; @@ -168,38 +165,33 @@ void ROIAlignForward_cpu_kernel( T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); // We use roi_bin_grid to sample the grid and mimic integral - int roi_bin_grid_h = (sampling_ratio > 0) - ? sampling_ratio - : static_cast(ceil(roi_height / pooled_height)); // e.g., = 2 - int roi_bin_grid_w = - (sampling_ratio > 0) ? sampling_ratio : static_cast(ceil(roi_width / pooled_width)); + int roi_bin_grid_h = + (sampling_ratio > 0) ? sampling_ratio : static_cast(ceil(roi_height / pooled_height)); // e.g., = 2 + int roi_bin_grid_w = (sampling_ratio > 0) ? 
sampling_ratio : static_cast(ceil(roi_width / pooled_width)); // We do average (integral) pooling inside a bin const T count = static_cast(roi_bin_grid_h * roi_bin_grid_w); // e.g. = 4 // we want to precalculate indices and weights shared by all chanels, // this is the key point of optimiation - std::vector> pre_calc( - roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); - pre_calc_for_bilinear_interpolate( - height, - width, - pooled_height, - pooled_width, - roi_bin_grid_h, - roi_bin_grid_w, - roi_start_h, - roi_start_w, - bin_size_h, - bin_size_w, - roi_bin_grid_h, - roi_bin_grid_w, - pre_calc); + std::vector> pre_calc(roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); + pre_calc_for_bilinear_interpolate(height, + width, + pooled_height, + pooled_width, + roi_bin_grid_h, + roi_bin_grid_w, + roi_start_h, + roi_start_w, + bin_size_h, + bin_size_w, + roi_bin_grid_h, + roi_bin_grid_w, + pre_calc); for (int c = 0; c < channels; c++) { int index_n_c = index_n + c * pooled_width * pooled_height; - const T* offset_bottom_data = - bottom_data + (roi_batch_ind * channels + c) * height * width; + const T* offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height * width; int pre_calc_index = 0; for (int ph = 0; ph < pooled_height; ph++) { @@ -210,10 +202,8 @@ void ROIAlignForward_cpu_kernel( for (int iy = 0; iy < roi_bin_grid_h; iy++) { for (int ix = 0; ix < roi_bin_grid_w; ix++) { PreCalc pc = pre_calc[pre_calc_index]; - output_val += pc.w1 * offset_bottom_data[pc.pos1] + - pc.w2 * offset_bottom_data[pc.pos2] + - pc.w3 * offset_bottom_data[pc.pos3] + - pc.w4 * offset_bottom_data[pc.pos4]; + output_val += pc.w1 * offset_bottom_data[pc.pos1] + pc.w2 * offset_bottom_data[pc.pos2] + + pc.w3 * offset_bottom_data[pc.pos3] + pc.w4 * offset_bottom_data[pc.pos4]; pre_calc_index += 1; } @@ -222,14 +212,12 @@ void ROIAlignForward_cpu_kernel( top_data[index] = output_val; } // for pw - } // for ph - } // for c + } // for ph + } // for c }); 
} - -void redistribute_rois(const float* rois, int* level_ids, - const int num_rois, const int levels_num) { +void redistribute_rois(const float* rois, int* level_ids, const int num_rois, const int levels_num) { const float canonical_scale = 224.0f; const int canonical_level = 2; @@ -252,11 +240,11 @@ void redistribute_rois(const float* rois, int* level_ids, } } - -void reord(const float* src_data, const int* ranks, const int n, const int step, float* dst_data, - int* dst_mapping) { +void reord(const float* src_data, const int* ranks, const int n, const int step, float* dst_data, int* dst_mapping) { std::iota(dst_mapping, dst_mapping + n, 0); - std::sort(dst_mapping, dst_mapping + n, [&ranks](size_t i1, size_t i2) {return ranks[i1] < ranks[i2];}); + std::sort(dst_mapping, dst_mapping + n, [&ranks](size_t i1, size_t i2) { + return ranks[i1] < ranks[i2]; + }); for (int i = 0; i < n; ++i) { const int j = dst_mapping[i]; assert(0 <= j && j < n); @@ -277,12 +265,13 @@ void split_points(const std::vector& ids, std::vector& rois_per_level, rois_per_level.insert(rois_per_level.begin(), 0); } -} // namespace +} // namespace bool ExperimentalDetectronROIFeatureExtractor::isSupportedOperation(const std::shared_ptr& op, - std::string& errorMessage) noexcept { + std::string& errorMessage) noexcept { try { - const auto roiFeatureExtractor = std::dynamic_pointer_cast(op); + const auto roiFeatureExtractor = + std::dynamic_pointer_cast(op); if (!roiFeatureExtractor) { errorMessage = "Only opset6 ExperimentalDetectronROIFeatureExtractor operation is supported"; return false; @@ -301,8 +290,9 @@ ExperimentalDetectronROIFeatureExtractor::ExperimentalDetectronROIFeatureExtract OPENVINO_THROW_NOT_IMPLEMENTED(errorMessage); } - const auto roiFeatureExtractor = std::dynamic_pointer_cast(op); - const auto &attr = roiFeatureExtractor->get_attrs(); + const auto roiFeatureExtractor = + std::dynamic_pointer_cast(op); + const auto& attr = roiFeatureExtractor->get_attrs(); output_dim_ = 
attr.output_size; pyramid_scales_ = attr.pyramid_scales; sampling_ratio_ = attr.sampling_ratio; @@ -321,8 +311,7 @@ void ExperimentalDetectronROIFeatureExtractor::initSupportedPrimitiveDescriptors inDataConf.emplace_back(LayoutType::ncsp, ov::element::f32); addSupportedPrimDesc(inDataConf, - {{LayoutType::ncsp, ov::element::f32}, - {LayoutType::ncsp, ov::element::f32}}, + {{LayoutType::ncsp, ov::element::f32}, {LayoutType::ncsp, ov::element::f32}}, impl_desc_type::ref_any); } @@ -332,15 +321,15 @@ void ExperimentalDetectronROIFeatureExtractor::execute(dnnl::stream strm) { const int channels_num = getParentEdgeAt(INPUT_FEATURES_START)->getMemory().getStaticDims()[1]; const int feaxels_per_roi = pooled_height_ * pooled_width_ * channels_num; - auto *input_rois = getSrcDataAtPortAs(INPUT_ROIS); - auto *output_rois_features = getDstDataAtPortAs(OUTPUT_ROI_FEATURES); - float *output_rois = nullptr; + auto* input_rois = getSrcDataAtPortAs(INPUT_ROIS); + auto* output_rois_features = getDstDataAtPortAs(OUTPUT_ROI_FEATURES); + float* output_rois = nullptr; if (OUTPUT_ROIS < outputShapes.size()) { output_rois = getDstDataAtPortAs(OUTPUT_ROIS); } std::vector level_ids(num_rois, 0); - redistribute_rois(input_rois, reinterpret_cast(&level_ids[0]), num_rois, levels_num); + redistribute_rois(input_rois, reinterpret_cast(&level_ids[0]), num_rois, levels_num); std::vector reordered_rois(4 * num_rois, 0); std::vector original_rois_mapping(num_rois, 0); @@ -354,7 +343,7 @@ void ExperimentalDetectronROIFeatureExtractor::execute(dnnl::stream strm) { const int level_rois_offset = rois_per_level[i]; const int level_rois_num = rois_per_level[i + 1] - level_rois_offset; if (level_rois_num > 0) { - auto *featuremap = getSrcDataAtPortAs(INPUT_FEATURES_START + i); + auto* featuremap = getSrcDataAtPortAs(INPUT_FEATURES_START + i); const int featuremap_height = getParentEdgeAt(INPUT_FEATURES_START + i)->getMemory().getStaticDims()[2]; const int featuremap_width = 
getParentEdgeAt(INPUT_FEATURES_START + i)->getMemory().getStaticDims()[3]; ROIAlignForward_cpu_kernel(feaxels_per_roi * level_rois_num, @@ -373,8 +362,12 @@ void ExperimentalDetectronROIFeatureExtractor::execute(dnnl::stream strm) { } std::vector dummy_mapping(num_rois, 0); - reord(&output_rois_features_temp[0], &original_rois_mapping[0], num_rois, feaxels_per_roi, - output_rois_features, &dummy_mapping[0]); + reord(&output_rois_features_temp[0], + &original_rois_mapping[0], + num_rois, + feaxels_per_roi, + output_rois_features, + &dummy_mapping[0]); if (output_rois != nullptr) { cpu_memcpy(output_rois, input_rois, 4 * num_rois * sizeof(float)); } @@ -384,6 +377,6 @@ bool ExperimentalDetectronROIFeatureExtractor::created() const { return getType() == Type::ExperimentalDetectronROIFeatureExtractor; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/experimental_detectron_roifeatureextractor.h b/src/plugins/intel_cpu/src/nodes/experimental_detectron_roifeatureextractor.h index 94bfdfd224d0c5..374fd62c61b776 100644 --- a/src/plugins/intel_cpu/src/nodes/experimental_detectron_roifeatureextractor.h +++ b/src/plugins/intel_cpu/src/nodes/experimental_detectron_roifeatureextractor.h @@ -14,22 +14,26 @@ class ExperimentalDetectronROIFeatureExtractor : public Node { public: ExperimentalDetectronROIFeatureExtractor(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; - bool needPrepareParams() const override { return false; }; - void executeDynamicImpl(dnnl::stream strm) override { execute(strm); }; + bool needPrepareParams() const override { + return false; + }; + void executeDynamicImpl(dnnl::stream strm) override { + 
execute(strm); + }; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; private: - const int INPUT_ROIS {0}; - const int INPUT_FEATURES_START {1}; + const int INPUT_ROIS{0}; + const int INPUT_FEATURES_START{1}; - const int OUTPUT_ROI_FEATURES {0}; - const size_t OUTPUT_ROIS {1}; + const int OUTPUT_ROI_FEATURES{0}; + const size_t OUTPUT_ROIS{1}; int output_dim_ = 0; int pooled_height_ = 0; @@ -39,6 +43,6 @@ class ExperimentalDetectronROIFeatureExtractor : public Node { bool aligned_ = false; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/experimental_detectron_topkrois.cpp b/src/plugins/intel_cpu/src/nodes/experimental_detectron_topkrois.cpp index 46b60fcdb83efd..f09d96ac7a7f7e 100644 --- a/src/plugins/intel_cpu/src/nodes/experimental_detectron_topkrois.cpp +++ b/src/plugins/intel_cpu/src/nodes/experimental_detectron_topkrois.cpp @@ -2,20 +2,22 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "experimental_detectron_topkrois.h" + +#include +#include #include #include -#include -#include -#include "openvino/core/parallel.hpp" #include "common/cpu_memcpy.h" -#include "experimental_detectron_topkrois.h" +#include "openvino/core/parallel.hpp" namespace ov { namespace intel_cpu { namespace node { -bool ExperimentalDetectronTopKROIs::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool ExperimentalDetectronTopKROIs::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage) noexcept { try { const auto topKROI = std::dynamic_pointer_cast(op); if (!topKROI) { @@ -56,8 +58,7 @@ void ExperimentalDetectronTopKROIs::initSupportedPrimitiveDescriptors() { if (!supportedPrimitiveDescriptors.empty()) return; - addSupportedPrimDesc({{LayoutType::ncsp, ov::element::f32}, - {LayoutType::ncsp, ov::element::f32}}, + 
addSupportedPrimDesc({{LayoutType::ncsp, ov::element::f32}, {LayoutType::ncsp, ov::element::f32}}, {{LayoutType::ncsp, ov::element::f32}}, impl_desc_type::ref_any); } @@ -66,14 +67,16 @@ void ExperimentalDetectronTopKROIs::execute(dnnl::stream strm) { const int input_rois_num = getParentEdgeAt(INPUT_ROIS)->getMemory().getStaticDims()[0]; const int top_rois_num = (std::min)(max_rois_num_, input_rois_num); - auto *input_rois = getSrcDataAtPortAs(INPUT_ROIS); - auto *input_probs = getSrcDataAtPortAs(INPUT_PROBS); - auto *output_rois = getDstDataAtPortAs(OUTPUT_ROIS); + auto* input_rois = getSrcDataAtPortAs(INPUT_ROIS); + auto* input_probs = getSrcDataAtPortAs(INPUT_PROBS); + auto* output_rois = getDstDataAtPortAs(OUTPUT_ROIS); std::vector idx(input_rois_num); iota(idx.begin(), idx.end(), 0); // FIXME. partial_sort is enough here. - sort(idx.begin(), idx.end(), [&input_probs](size_t i1, size_t i2) {return input_probs[i1] > input_probs[i2];}); + sort(idx.begin(), idx.end(), [&input_probs](size_t i1, size_t i2) { + return input_probs[i1] > input_probs[i2]; + }); for (int i = 0; i < top_rois_num; ++i) { cpu_memcpy(output_rois + 4 * i, input_rois + 4 * idx[i], 4 * sizeof(float)); @@ -84,6 +87,6 @@ bool ExperimentalDetectronTopKROIs::created() const { return getType() == Type::ExperimentalDetectronTopKROIs; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/experimental_detectron_topkrois.h b/src/plugins/intel_cpu/src/nodes/experimental_detectron_topkrois.h index 5c2db1fa2303ea..3fe134948d5e45 100644 --- a/src/plugins/intel_cpu/src/nodes/experimental_detectron_topkrois.h +++ b/src/plugins/intel_cpu/src/nodes/experimental_detectron_topkrois.h @@ -14,14 +14,20 @@ class ExperimentalDetectronTopKROIs : public Node { public: ExperimentalDetectronTopKROIs(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() 
override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; - bool needShapeInfer() const override { return false; }; - bool needPrepareParams() const override { return false; }; - void executeDynamicImpl(dnnl::stream strm) override { execute(strm); }; + bool needShapeInfer() const override { + return false; + }; + bool needPrepareParams() const override { + return false; + }; + void executeDynamicImpl(dnnl::stream strm) override { + execute(strm); + }; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; @@ -32,15 +38,15 @@ class ExperimentalDetectronTopKROIs : public Node { // Outputs: // top_rois, shape [max_rois, 4] - const int INPUT_ROIS {0}; - const int INPUT_PROBS {1}; + const int INPUT_ROIS{0}; + const int INPUT_PROBS{1}; - const int OUTPUT_ROIS {0}; + const int OUTPUT_ROIS{0}; int max_rois_num_; std::string errorPrefix; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/extract_image_patches.cpp b/src/plugins/intel_cpu/src/nodes/extract_image_patches.cpp index 8b5d0b510614e1..51ae2123bbd382 100644 --- a/src/plugins/intel_cpu/src/nodes/extract_image_patches.cpp +++ b/src/plugins/intel_cpu/src/nodes/extract_image_patches.cpp @@ -3,15 +3,16 @@ // #include "extract_image_patches.h" -#include "common/primitive_hashing_utils.hpp" -#include "cpu/x64/jit_generator.hpp" -#include "openvino/core/parallel.hpp" -#include "openvino/opsets/opset3.hpp" #include #include #include +#include "common/primitive_hashing_utils.hpp" +#include "cpu/x64/jit_generator.hpp" +#include "openvino/core/parallel.hpp" +#include "openvino/opsets/opset3.hpp" + using namespace dnnl::impl::cpu; using namespace dnnl::impl::cpu::x64; using namespace dnnl::impl::utils; @@ -21,13 +22,15 @@ namespace ov { 
namespace intel_cpu { namespace node { #if defined(OPENVINO_ARCH_X86_64) -#define GET_OFF(field) offsetof(jit_extract_image_patches_args, field) +# define GET_OFF(field) offsetof(jit_extract_image_patches_args, field) template struct jit_extract_image_patches_kernel : public jit_uni_extract_image_patches_kernel, public jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_extract_image_patches_kernel) - explicit jit_extract_image_patches_kernel(jit_extract_image_patches_params jpp) : jit_uni_extract_image_patches_kernel(jpp), jit_generator(jit_name()) {} + explicit jit_extract_image_patches_kernel(jit_extract_image_patches_params jpp) + : jit_uni_extract_image_patches_kernel(jpp), + jit_generator(jit_name()) {} void create_ker() override { jit_generator::create_kernel(); @@ -92,35 +95,47 @@ struct jit_extract_image_patches_kernel : public jit_uni_extract_image_patches_k Vmm vmm = Vmm(0); Xmm xmm = Xmm(0); - Vmm vmm_zero = Vmm(1); // reserved for pad + Vmm vmm_zero = Vmm(1); // reserved for pad Xbyak::Xmm xmm_aux = Xbyak::Xmm(2); Vmm vmm_gather_index = Vmm(3); Vmm vmm_gather_mask = Vmm(4); Opmask k_mask = Xbyak::Opmask(1); Xbyak::Label gather_index_table; - inline void load_scalar(Vmm vmm_arg, const Xbyak::Address &op) { + inline void load_scalar(Vmm vmm_arg, const Xbyak::Address& op) { Xbyak::Xmm xmm_src = Xmm(vmm_arg.getIdx()); switch (jpp.dtype_size) { - case 4: uni_vmovss(vmm_arg, op); break; - case 2: uni_vpinsrw(xmm_src, xmm_src, op, 0x0); break; - case 1: uni_vpinsrb(xmm_src, xmm_src, op, 0x0); break; - default: - OPENVINO_THROW("The data type of size '", jpp.dtype_size, "' is not supported."); + case 4: + uni_vmovss(vmm_arg, op); + break; + case 2: + uni_vpinsrw(xmm_src, xmm_src, op, 0x0); + break; + case 1: + uni_vpinsrb(xmm_src, xmm_src, op, 0x0); + break; + default: + OPENVINO_THROW("The data type of size '", jpp.dtype_size, "' is not supported."); } } - inline void store_scalar(const Xbyak::Address &op, Vmm vmm_arg) { + inline void store_scalar(const 
Xbyak::Address& op, Vmm vmm_arg) { Xbyak::Xmm xmm_dst = Xmm(vmm_arg.getIdx()); switch (jpp.dtype_size) { - case 4: uni_vmovss(op, vmm_arg); break; - case 2: uni_vpextrw(op, xmm_dst, 0x0); break; - case 1: uni_vpextrb(op, xmm_dst, 0x0); break; - default: - OPENVINO_THROW("The data type of size '", jpp.dtype_size, "' is not supported."); + case 4: + uni_vmovss(op, vmm_arg); + break; + case 2: + uni_vpextrw(op, xmm_dst, 0x0); + break; + case 1: + uni_vpextrb(op, xmm_dst, 0x0); + break; + default: + OPENVINO_THROW("The data type of size '", jpp.dtype_size, "' is not supported."); } } - inline void pad_with_zeros(reg64_t ®_num_pads_arg, reg64_t ®_dst_arg) { + inline void pad_with_zeros(reg64_t& reg_num_pads_arg, reg64_t& reg_dst_arg) { Xbyak::Label main, tail, exit; L(main); { @@ -143,57 +158,67 @@ struct jit_extract_image_patches_kernel : public jit_uni_extract_image_patches_k L(exit); } - inline void custom_uni_vgatherdps(const Vmm &vmm_arg, reg64_t &mem_base, const Vmm &mem_offset, Vmm &vmm_mask) { + inline void custom_uni_vgatherdps(const Vmm& vmm_arg, reg64_t& mem_base, const Vmm& mem_offset, Vmm& vmm_mask) { switch (isa) { - case x64::avx2: - uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); - vgatherdps(vmm_arg, ptr[mem_base + mem_offset], vmm_mask); - break; - case x64::avx512_core: - kxnord(k_mask, k_mask, k_mask); - vgatherdps(vmm_arg | k_mask, ptr[mem_base + mem_offset]); - break; - case x64::sse41: - emulate_gather(vmm_arg, mem_base); - break; - default: - OPENVINO_THROW("Got unsupported instruction set."); + case x64::avx2: + uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); + vgatherdps(vmm_arg, ptr[mem_base + mem_offset], vmm_mask); + break; + case x64::avx512_core: + kxnord(k_mask, k_mask, k_mask); + vgatherdps(vmm_arg | k_mask, ptr[mem_base + mem_offset]); + break; + case x64::sse41: + emulate_gather(vmm_arg, mem_base); + break; + default: + OPENVINO_THROW("Got unsupported instruction set."); } } - inline void gather_src2vmm(const Vmm &vmm_arg, reg64_t &mem_base) { 
+ inline void gather_src2vmm(const Vmm& vmm_arg, reg64_t& mem_base) { switch (jpp.dtype_size) { - case 4: custom_uni_vgatherdps(vmm, mem_base, vmm_gather_index, vmm_gather_mask); break; - case 2: - case 1: emulate_gather(vmm_arg, mem_base); break; - default: - OPENVINO_THROW("The data type of size '", jpp.dtype_size, "' is not supported."); + case 4: + custom_uni_vgatherdps(vmm, mem_base, vmm_gather_index, vmm_gather_mask); + break; + case 2: + case 1: + emulate_gather(vmm_arg, mem_base); + break; + default: + OPENVINO_THROW("The data type of size '", jpp.dtype_size, "' is not supported."); } } - inline void emulate_gather(const Xbyak::Xmm &xmm_arg, reg64_t &mem_base, int xmm_offset = 0) { - const int xmm_size = 16; // bytes + inline void emulate_gather(const Xbyak::Xmm& xmm_arg, reg64_t& mem_base, int xmm_offset = 0) { + const int xmm_size = 16; // bytes const int xmm_block_size = xmm_size / jpp.dtype_size; const int offset = xmm_offset * jpp.SW * jpp.dtype_size * xmm_block_size; for (int i = 0; i < xmm_block_size; i++) { Xbyak::Address addr = ptr[mem_base + i * jpp.SW * jpp.dtype_size + offset]; switch (jpp.dtype_size) { - case 4: uni_vpinsrd(xmm_arg, xmm_arg, addr, i); break; - case 2: uni_vpinsrw(xmm_arg, xmm_arg, addr, i); break; - case 1: uni_vpinsrb(xmm_arg, xmm_arg, addr, i); break; - default: - OPENVINO_THROW("The data type of size '", jpp.dtype_size, "' is not supported."); + case 4: + uni_vpinsrd(xmm_arg, xmm_arg, addr, i); + break; + case 2: + uni_vpinsrw(xmm_arg, xmm_arg, addr, i); + break; + case 1: + uni_vpinsrb(xmm_arg, xmm_arg, addr, i); + break; + default: + OPENVINO_THROW("The data type of size '", jpp.dtype_size, "' is not supported."); } } } - inline void emulate_gather(const Xbyak::Ymm &ymm_arg, reg64_t &mem_base) { + inline void emulate_gather(const Xbyak::Ymm& ymm_arg, reg64_t& mem_base) { Xbyak::Xmm low_xmm = Xbyak::Xmm(ymm_arg.getIdx()); emulate_gather(low_xmm, mem_base, 0); emulate_gather(xmm_aux, mem_base, 1); vinserti128(ymm_arg, 
ymm_arg, xmm_aux, 1); } - inline void emulate_gather(const Xbyak::Zmm &zmm_arg, reg64_t &mem_base) { + inline void emulate_gather(const Xbyak::Zmm& zmm_arg, reg64_t& mem_base) { Xbyak::Xmm low_xmm = Xbyak::Xmm(zmm_arg.getIdx()); emulate_gather(low_xmm, mem_base, 0); for (int i = 1; i < 4; i++) { @@ -270,9 +295,10 @@ struct jit_extract_image_patches_kernel : public jit_uni_extract_image_patches_k dd(i * jpp.SW * jpp.dtype_size); } }; -#endif // OPENVINO_ARCH_X86_64 +#endif // OPENVINO_ARCH_X86_64 -bool ExtractImagePatches::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool ExtractImagePatches::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage) noexcept { try { auto extImgPatcher = ov::as_type_ptr(op); if (!extImgPatcher) { @@ -284,7 +310,10 @@ bool ExtractImagePatches::isSupportedOperation(const std::shared_ptrget_sizes().size(), extImgPatcher->get_strides().size(), extImgPatcher->get_rates().size())) { + if (!everyone_is(2u, + extImgPatcher->get_sizes().size(), + extImgPatcher->get_strides().size(), + extImgPatcher->get_rates().size())) { errorMessage = "Doesn't support 'sizes', 'strides', 'rates', attributes with rank != 2"; return false; } @@ -323,7 +352,7 @@ size_t ExtractImagePatchesKey::hash() const { bool ExtractImagePatchesKey::operator==(const ExtractImagePatchesKey& rhs) const { bool result = inDims == rhs.inDims && outDims == rhs.outDims && kSizes == rhs.kSizes && strides == rhs.strides && - rates == rhs.rates && padType == rhs.padType && prcSize == rhs.prcSize; + rates == rhs.rates && padType == rhs.padType && prcSize == rhs.prcSize; return result; } } // namespace @@ -362,7 +391,8 @@ ExtractImagePatches::ExtractImagePatches(const std::shared_ptr& op, co OPENVINO_THROW(errorPrefix, "has unsupported pad type: ", extImgPatcher->get_auto_pad()); } - _ksizes = extImgPatcher->get_sizes();; + _ksizes = extImgPatcher->get_sizes(); + ; _strides = extImgPatcher->get_strides(); _rates = 
extImgPatcher->get_rates(); if (_ksizes.size() != 2 || _strides.size() != 2 || _rates.size() != 2) @@ -416,9 +446,7 @@ void ExtractImagePatches::initSupportedPrimitiveDescriptors() { if (_supported_precisions_sizes.find(precision.size()) == _supported_precisions_sizes.end()) OPENVINO_THROW(errorPrefix, "has unsupported precision: ", precision.get_type_name()); - addSupportedPrimDesc({{LayoutType::ncsp, precision}}, - {{LayoutType::ncsp, precision}}, - impl_desc_type::ref_any); + addSupportedPrimDesc({{LayoutType::ncsp, precision}}, {{LayoutType::ncsp, precision}}, impl_desc_type::ref_any); } void ExtractImagePatches::execute(dnnl::stream strm) { @@ -437,12 +465,17 @@ void ExtractImagePatches::executeDynamicImpl(dnnl::stream strm) { execute(strm); } -void ExtractImagePatches::ExtractImagePatchesRefExecutor::executeReference( - void* src, void* dst, const VectorDims& istrides, const VectorDims& ostrides) const { +void ExtractImagePatches::ExtractImagePatchesRefExecutor::executeReference(void* src, + void* dst, + const VectorDims& istrides, + const VectorDims& ostrides) const { const char* src_data = reinterpret_cast(src); char* dst_data = reinterpret_cast(dst); - const std::vector ostrides_partial = { ostrides[0], jpp.KW * IC * ostrides[1], IC * ostrides[1], ostrides[1] }; + const std::vector ostrides_partial = {ostrides[0], + jpp.KW * IC * ostrides[1], + IC * ostrides[1], + ostrides[1]}; parallel_for4d(OB, jpp.KH, jpp.KW, IC, [&](const size_t ob, const size_t kh, const size_t kw, const size_t ic) { const int64_t iw_start = static_cast(kw * RW) - PL; @@ -450,12 +483,17 @@ void ExtractImagePatches::ExtractImagePatchesRefExecutor::executeReference( const size_t ih_lpad = ih_start >= 0 ? 0 : std::ceil(-1.f * ih_start / jpp.SH); const size_t iw_lpad = iw_start >= 0 ? 0 : std::ceil(-1.f * iw_start / jpp.SW); - const size_t ih_hpad = std::ceil((IH - 1.f * ih_start) / jpp.SH) > jpp.OH ? 
jpp.OH : std::ceil((IH + -1.f * ih_start) / jpp.SH); - const size_t iw_hpad = std::ceil((jpp.IW - 1.f * iw_start) / jpp.SW) > jpp.OW ? jpp.OW : std::ceil((jpp.IW - 1.f * iw_start) / jpp.SW); + const size_t ih_hpad = + std::ceil((IH - 1.f * ih_start) / jpp.SH) > jpp.OH ? jpp.OH : std::ceil((IH + -1.f * ih_start) / jpp.SH); + const size_t iw_hpad = std::ceil((jpp.IW - 1.f * iw_start) / jpp.SW) > jpp.OW + ? jpp.OW + : std::ceil((jpp.IW - 1.f * iw_start) / jpp.SW); - char* my_dst_ptr = dst_data + - (ob * ostrides_partial[0] + kh * ostrides_partial[1] + kw * ostrides_partial[2] + ic * ostrides_partial[3]) * jpp.dtype_size; - const char* my_src_ptr = src_data + (ob * istrides[0] + ic * istrides[1] + ih_start * istrides[2] + iw_start) * jpp.dtype_size; + char* my_dst_ptr = dst_data + (ob * ostrides_partial[0] + kh * ostrides_partial[1] + kw * ostrides_partial[2] + + ic * ostrides_partial[3]) * + jpp.dtype_size; + const char* my_src_ptr = + src_data + (ob * istrides[0] + ic * istrides[1] + ih_start * istrides[2] + iw_start) * jpp.dtype_size; size_t num_bytes_to_set = ih_lpad * jpp.OW * jpp.dtype_size; memset(my_dst_ptr, 0, num_bytes_to_set); @@ -463,14 +501,15 @@ void ExtractImagePatches::ExtractImagePatchesRefExecutor::executeReference( const char* src_ptr_h_stop = my_src_ptr + ih_hpad * jpp.SH * jpp.IW * jpp.dtype_size; for (const char* src_h_ptr = my_src_ptr + ih_lpad * jpp.SH * jpp.IW * jpp.dtype_size; - src_h_ptr < src_ptr_h_stop; src_h_ptr += jpp.SH * jpp.IW * jpp.dtype_size) { + src_h_ptr < src_ptr_h_stop; + src_h_ptr += jpp.SH * jpp.IW * jpp.dtype_size) { num_bytes_to_set = iw_lpad * jpp.dtype_size; memset(my_dst_ptr, 0, num_bytes_to_set); my_dst_ptr += num_bytes_to_set; const char* src_ptr_w_stop = src_h_ptr + iw_hpad * jpp.SW * jpp.dtype_size; - for (const char* src_w_ptr = src_h_ptr + iw_lpad * jpp.SW * jpp.dtype_size; - src_w_ptr < src_ptr_w_stop; src_w_ptr += jpp.SW * jpp.dtype_size) { + for (const char* src_w_ptr = src_h_ptr + iw_lpad * jpp.SW * 
jpp.dtype_size; src_w_ptr < src_ptr_w_stop; + src_w_ptr += jpp.SW * jpp.dtype_size) { num_bytes_to_set = jpp.dtype_size; memcpy(my_dst_ptr, src_w_ptr, num_bytes_to_set); my_dst_ptr += num_bytes_to_set; @@ -484,25 +523,35 @@ void ExtractImagePatches::ExtractImagePatchesRefExecutor::executeReference( }); } -void ExtractImagePatches::ExtractImagePatchesJitExecutor::executeOptimizedGeneric( - void* src, void* dst, const VectorDims& istrides, const VectorDims& ostrides) const { +void ExtractImagePatches::ExtractImagePatchesJitExecutor::executeOptimizedGeneric(void* src, + void* dst, + const VectorDims& istrides, + const VectorDims& ostrides) const { #if defined(OPENVINO_ARCH_X86_64) const char* src_data = reinterpret_cast(src); char* dst_data = reinterpret_cast(dst); const auto& jpp = pKernel->jpp; - const std::vector ostrides_partial = { ostrides[0], jpp.KW * IC * ostrides[1], IC * ostrides[1], ostrides[1] }; + const std::vector ostrides_partial = {ostrides[0], + jpp.KW * IC * ostrides[1], + IC * ostrides[1], + ostrides[1]}; parallel_for4d(OB, jpp.KH, jpp.KW, IC, [&](const size_t ob, const size_t kh, const size_t kw, const size_t ic) { const int64_t ih_start = kh * RH - PT; const int64_t iw_start = kw * RW - PL; const size_t ih_lpad = ih_start >= 0 ? 0 : std::ceil(-1.f * ih_start / jpp.SH); const size_t iw_lpad = iw_start >= 0 ? 0 : std::ceil(-1.f * iw_start / jpp.SW); - const size_t ih_hpad = std::ceil((IH - 1.f * ih_start) / jpp.SH) > jpp.OH ? jpp.OH : std::ceil((IH - 1.f * ih_start) / jpp.SH); - const size_t iw_hpad = std::ceil((jpp.IW - 1.f * iw_start) / jpp.SW) > jpp.OW ? jpp.OW : std::ceil((jpp.IW - 1.f * iw_start) / jpp.SW); + const size_t ih_hpad = + std::ceil((IH - 1.f * ih_start) / jpp.SH) > jpp.OH ? jpp.OH : std::ceil((IH - 1.f * ih_start) / jpp.SH); + const size_t iw_hpad = std::ceil((jpp.IW - 1.f * iw_start) / jpp.SW) > jpp.OW + ? 
jpp.OW + : std::ceil((jpp.IW - 1.f * iw_start) / jpp.SW); - size_t dst_offset = ob * ostrides_partial[0] + kh * ostrides_partial[1] + kw * ostrides_partial[2] + ic * ostrides_partial[3]; - size_t src_offset = ob * istrides[0] + ic * istrides[1] + ih_start * istrides[2] + iw_start + ih_lpad * jpp.SH * jpp.IW; + size_t dst_offset = + ob * ostrides_partial[0] + kh * ostrides_partial[1] + kw * ostrides_partial[2] + ic * ostrides_partial[3]; + size_t src_offset = + ob * istrides[0] + ic * istrides[1] + ih_start * istrides[2] + iw_start + ih_lpad * jpp.SH * jpp.IW; auto args = jit_extract_image_patches_args(); args.src = src_data + src_offset * jpp.dtype_size; @@ -513,7 +562,7 @@ void ExtractImagePatches::ExtractImagePatchesJitExecutor::executeOptimizedGeneri args.w_hi_pad = iw_hpad; (*pKernel)(&args); }); -#endif // OPENVINO_ARCH_X86_64 +#endif // OPENVINO_ARCH_X86_64 } jit_extract_image_patches_params ExtractImagePatches::ExtractImagePatchesExecutor::fillJpp( @@ -584,14 +633,13 @@ jit_extract_image_patches_params ExtractImagePatches::ExtractImagePatchesExecuto return jpp; } -ExtractImagePatches::ExtractImagePatchesJitExecutor::ExtractImagePatchesJitExecutor( - const VectorDims& inDims, - const VectorDims& outDims, - const VectorDims& kSizes, - const VectorDims& strides, - const VectorDims& rates, - const ExtImgPatcherPadType& padType, - const size_t prcSize) { +ExtractImagePatches::ExtractImagePatchesJitExecutor::ExtractImagePatchesJitExecutor(const VectorDims& inDims, + const VectorDims& outDims, + const VectorDims& kSizes, + const VectorDims& strides, + const VectorDims& rates, + const ExtImgPatcherPadType& padType, + const size_t prcSize) { #if defined(OPENVINO_ARCH_X86_64) auto jpp = fillJpp(inDims, outDims, kSizes, strides, rates, padType, prcSize); if (mayiuse(x64::avx512_core)) { @@ -606,27 +654,31 @@ ExtractImagePatches::ExtractImagePatchesJitExecutor::ExtractImagePatchesJitExecu if (pKernel) pKernel->create_ker(); -#endif // OPENVINO_ARCH_X86_64 +#endif // 
OPENVINO_ARCH_X86_64 } -void ExtractImagePatches::ExtractImagePatchesJitExecutor::exec( - void* src, void* dst, const VectorDims& istrides, const VectorDims& ostrides) { +void ExtractImagePatches::ExtractImagePatchesJitExecutor::exec(void* src, + void* dst, + const VectorDims& istrides, + const VectorDims& ostrides) { if (!pKernel) OPENVINO_THROW("Can't execute, kernel for extract image patches node is not compiled"); executeOptimizedGeneric(src, dst, istrides, ostrides); } -ExtractImagePatches::ExtractImagePatchesRefExecutor::ExtractImagePatchesRefExecutor( - const VectorDims& inDims, - const VectorDims& outDims, - const VectorDims& kSizes, - const VectorDims& strides, - const VectorDims& rates, - const ExtImgPatcherPadType& padType, - const size_t prcSize) : jpp(fillJpp(inDims, outDims, kSizes, strides, rates, padType, prcSize)) {} - -void ExtractImagePatches::ExtractImagePatchesRefExecutor::exec( - void* src, void* dst, const VectorDims& istrides, const VectorDims& ostrides) { +ExtractImagePatches::ExtractImagePatchesRefExecutor::ExtractImagePatchesRefExecutor(const VectorDims& inDims, + const VectorDims& outDims, + const VectorDims& kSizes, + const VectorDims& strides, + const VectorDims& rates, + const ExtImgPatcherPadType& padType, + const size_t prcSize) + : jpp(fillJpp(inDims, outDims, kSizes, strides, rates, padType, prcSize)) {} + +void ExtractImagePatches::ExtractImagePatchesRefExecutor::exec(void* src, + void* dst, + const VectorDims& istrides, + const VectorDims& ostrides) { executeReference(src, dst, istrides, ostrides); } @@ -636,6 +688,6 @@ bool ExtractImagePatches::created() const { return getType() == Type::ExtractImagePatches; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/extract_image_patches.h b/src/plugins/intel_cpu/src/nodes/extract_image_patches.h index 15220fd51a4285..1844b5cafeeb07 100644 --- 
a/src/plugins/intel_cpu/src/nodes/extract_image_patches.h +++ b/src/plugins/intel_cpu/src/nodes/extract_image_patches.h @@ -30,8 +30,11 @@ struct jit_extract_image_patches_args { }; struct jit_uni_extract_image_patches_kernel { - void (*ker_)(const jit_extract_image_patches_args *); - void operator()(const jit_extract_image_patches_args *args) { assert(ker_); ker_(args); } + void (*ker_)(const jit_extract_image_patches_args*); + void operator()(const jit_extract_image_patches_args* args) { + assert(ker_); + ker_(args); + } jit_extract_image_patches_params jpp; virtual void create_ker() = 0; explicit jit_uni_extract_image_patches_kernel(jit_extract_image_patches_params jpp) : ker_(nullptr), jpp(jpp) {} @@ -42,7 +45,7 @@ class ExtractImagePatches : public Node { public: ExtractImagePatches(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; @@ -51,11 +54,7 @@ class ExtractImagePatches : public Node { void prepareParams() override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; - enum class ExtImgPatcherPadType { - VALID, - SAME_LOWER, - SAME_UPPER - }; + enum class ExtImgPatcherPadType { VALID, SAME_LOWER, SAME_UPPER }; private: std::vector _ksizes; @@ -69,14 +68,13 @@ class ExtractImagePatches : public Node { struct ExtractImagePatchesExecutor { ExtractImagePatchesExecutor() = default; virtual void exec(void* src, void* dst, const VectorDims& istrides, const VectorDims& ostrides) = 0; - jit_extract_image_patches_params fillJpp( - const VectorDims& inDims, - const VectorDims& outDims, - const VectorDims& kSizes, - const VectorDims& strides, - const VectorDims& rates, - const ExtImgPatcherPadType& padType, - const size_t prcSize); + jit_extract_image_patches_params fillJpp(const VectorDims& inDims, 
+ const VectorDims& outDims, + const VectorDims& kSizes, + const VectorDims& strides, + const VectorDims& rates, + const ExtImgPatcherPadType& padType, + const size_t prcSize); virtual ~ExtractImagePatchesExecutor() = default; protected: @@ -93,30 +91,31 @@ class ExtractImagePatches : public Node { executorPtr execPtr = nullptr; struct ExtractImagePatchesJitExecutor : public ExtractImagePatchesExecutor { - ExtractImagePatchesJitExecutor( - const VectorDims& inDims, - const VectorDims& outDims, - const VectorDims& kSizes, - const VectorDims& strides, - const VectorDims& rates, - const ExtImgPatcherPadType& padType, - const size_t prcSize); + ExtractImagePatchesJitExecutor(const VectorDims& inDims, + const VectorDims& outDims, + const VectorDims& kSizes, + const VectorDims& strides, + const VectorDims& rates, + const ExtImgPatcherPadType& padType, + const size_t prcSize); void exec(void* src, void* dst, const VectorDims& istrides, const VectorDims& ostrides) override; - void executeOptimizedGeneric(void* src, void* dst, const VectorDims& istrides, const VectorDims& ostrides) const; + void executeOptimizedGeneric(void* src, + void* dst, + const VectorDims& istrides, + const VectorDims& ostrides) const; private: std::unique_ptr pKernel; }; struct ExtractImagePatchesRefExecutor : public ExtractImagePatchesExecutor { - ExtractImagePatchesRefExecutor( - const VectorDims& inDims, - const VectorDims& outDims, - const VectorDims& kSizes, - const VectorDims& strides, - const VectorDims& rates, - const ExtImgPatcherPadType& padType, - const size_t prcSize); + ExtractImagePatchesRefExecutor(const VectorDims& inDims, + const VectorDims& outDims, + const VectorDims& kSizes, + const VectorDims& strides, + const VectorDims& rates, + const ExtImgPatcherPadType& padType, + const size_t prcSize); void exec(void* src, void* dst, const VectorDims& istrides, const VectorDims& ostrides) override; void executeReference(void* src, void* dst, const VectorDims& istrides, const VectorDims& 
ostrides) const; @@ -125,6 +124,6 @@ class ExtractImagePatches : public Node { }; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/eye.cpp b/src/plugins/intel_cpu/src/nodes/eye.cpp index f1e78b04510914..deb47abdba2dee 100644 --- a/src/plugins/intel_cpu/src/nodes/eye.cpp +++ b/src/plugins/intel_cpu/src/nodes/eye.cpp @@ -3,10 +3,12 @@ // #include "eye.h" -#include "openvino/op/eye.hpp" + #include + #include "openvino/core/parallel.hpp" -#include "shape_inference/shape_inference_ngraph.hpp" +#include "openvino/op/eye.hpp" +#include "shape_inference/shape_inference.hpp" #include "utils/bfloat16.hpp" #define THROW_ERROR(...) OPENVINO_THROW(NameFromType(getType()), " node with name '", getName(), "' ", __VA_ARGS__) @@ -33,28 +35,24 @@ class EyeShapeInferFactory : public ShapeInferFactory { public: EyeShapeInferFactory(std::shared_ptr op) : m_op(op) {} ShapeInferPtr makeShapeInfer() const override { - IShapeInfer::port_mask_t port_mask = EMPTY_PORT_MASK; - if (m_op->get_input_size() == 4) { - port_mask = PortMask(Eye::ROWS_NUM, Eye::COLS_NUM, Eye::DIAGONAL_INDEX, Eye::BATCH_SHAPE); - } else { - port_mask = PortMask(Eye::ROWS_NUM, Eye::COLS_NUM, Eye::DIAGONAL_INDEX); - } - return std::make_shared(make_shape_inference(m_op), port_mask); + return (m_op->get_input_size() == 4) ? 
make_shape_inference(m_op) + : make_shape_inference(m_op, PortMask(Eye::ROWS_NUM, Eye::COLS_NUM)); } + private: std::shared_ptr m_op; }; -} // namespace +} // namespace -Eye::Eye(const std::shared_ptr& op, const GraphContext::CPtr context) : Node(op, context, EyeShapeInferFactory(op)) { +Eye::Eye(const std::shared_ptr& op, const GraphContext::CPtr context) + : Node(op, context, EyeShapeInferFactory(op)) { std::string errorMessage; if (!isSupportedOperation(op, errorMessage)) { - OPENVINO_THROW_NOT_IMPLEMENTED(errorMessage); + OPENVINO_THROW_NOT_IMPLEMENTED(errorMessage); } outType = op->get_output_element_type(0); withBatchShape = (op->get_input_size() == 4); - if (!one_of(outType, ov::element::f32, ov::element::bf16, - ov::element::i32, ov::element::i8, ov::element::u8)) { + if (!one_of(outType, ov::element::f32, ov::element::bf16, ov::element::i32, ov::element::i8, ov::element::u8)) { THROW_ERROR(errorPrefix, "doesn't support demanded output precision"); } } @@ -66,16 +64,19 @@ void Eye::getSupportedDescriptors() { THROW_ERROR(errorPrefix, "has incorrect number of output edges: ", getChildEdges().size()); } -template +template struct Eye::EyeExecute { - void operator()(Eye *node) { + void operator()(Eye* node) { node->executeSpecified(); } }; void Eye::execute(dnnl::stream strm) { auto outputPrec = getChildEdgeAt(0)->getMemory().getDesc().getPrecision(); - OV_SWITCH(intel_cpu, EyeExecute, this, outputPrec, + OV_SWITCH(intel_cpu, + EyeExecute, + this, + outputPrec, OV_CASE(ov::element::f32, float), OV_CASE(ov::element::bf16, bfloat16_t), OV_CASE(ov::element::i32, int), @@ -104,9 +105,9 @@ void Eye::executeSpecified() { const size_t colNum = getColNum(); const int64_t shift = getDiagIndex(); auto outPtr = getDstMemoryAtPort(0); - if (!outPtr || !outPtr ->isDefined()) + if (!outPtr || !outPtr->isDefined()) THROW_ERROR(errorPrefix, "Destination memory is undefined."); - T *dst = outPtr->getDataAs(); + T* dst = outPtr->getDataAs(); const size_t batchVolume = 
getBatchVolume(getBatchShape()); const size_t spatialCount = colNum * rowNum; @@ -116,8 +117,8 @@ void Eye::executeSpecified() { const int64_t countByColumns = std::max(int64_t(colNum) - std::abs(shift), int64_t(0)); const int64_t countByRows = std::max(int64_t(rowNum) - std::abs(shift), int64_t(0)); - const size_t onesPerBatchNum = - static_cast(shift > 0 ? std::min(countByColumns, int64_t(rowNum)) : std::min(countByRows, int64_t(colNum))); + const size_t onesPerBatchNum = static_cast(shift > 0 ? std::min(countByColumns, int64_t(rowNum)) + : std::min(countByRows, int64_t(colNum))); const size_t dataShift = static_cast(shift >= 0 ? shift : -shift * colNum); if (spatialSize >= l2CacheSize) { @@ -126,7 +127,8 @@ void Eye::executeSpecified() { splitter(elementsCount, nthr, ithr, start, end); memset(dst + start, 0, (end - start) * sizeof(T)); }); - if (onesPerBatchNum == 0) return; + if (onesPerBatchNum == 0) + return; for (size_t bShift = 0; bShift < batchVolume * spatialCount; bShift += spatialCount) { parallel_nt(0, [&](const size_t ithr, const size_t nthr) { size_t start = 0, end = 0; @@ -141,7 +143,8 @@ void Eye::executeSpecified() { size_t start = 0, end = 0; splitter(batchVolume, nthr, ithr, start, end); memset(dst + start * spatialCount, 0, (end - start) * spatialSize); - if (onesPerBatchNum == 0) return; + if (onesPerBatchNum == 0) + return; for (size_t spShift = start * spatialCount; spShift < end * spatialCount; spShift += spatialCount) { for (size_t j = 0; j < onesPerBatchNum; j++) { dst[dataShift + j * (colNum + 1) + spShift] = static_cast(1); @@ -154,6 +157,6 @@ void Eye::executeSpecified() { bool Eye::created() const { return getType() == Type::Eye; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/eye.h b/src/plugins/intel_cpu/src/nodes/eye.h index 7978c45d8a05d1..fc2b42a18bdbe9 100644 --- 
a/src/plugins/intel_cpu/src/nodes/eye.h +++ b/src/plugins/intel_cpu/src/nodes/eye.h @@ -5,9 +5,11 @@ #pragma once #include -#include + #include +#include #include + #include "dnnl_extension_utils.h" namespace ov { @@ -28,9 +30,15 @@ class Eye : public Node { void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; - bool needPrepareParams() const override {return false;}; - bool needShapeInfer() const override {return true;}; - void executeDynamicImpl(dnnl::stream strm) override { execute(strm); } + bool needPrepareParams() const override { + return false; + }; + bool needShapeInfer() const override { + return true; + }; + void executeDynamicImpl(dnnl::stream strm) override { + execute(strm); + } static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; @@ -39,13 +47,13 @@ class Eye : public Node { ov::element::Type outType = ov::element::Type_t::undefined; template void executeSpecified(); - template + template struct EyeExecute; inline const size_t getRowNum() const { auto rowMem = getSrcMemoryAtPort(ROWS_NUM); if (rowMem == nullptr) OPENVINO_THROW(errorPrefix, " doesn't contain row_count data"); - const int *rowPtr = rowMem->getDataAs(); + const int* rowPtr = rowMem->getDataAs(); return rowPtr[0]; } @@ -53,7 +61,7 @@ class Eye : public Node { auto colMem = getSrcMemoryAtPort(COLS_NUM); if (colMem == nullptr) OPENVINO_THROW(errorPrefix, " doesn't contain col_count data"); - const int *colPtr = colMem->getDataAs(); + const int* colPtr = colMem->getDataAs(); return colPtr[0]; } @@ -61,28 +69,29 @@ class Eye : public Node { auto diagIndMem = getSrcMemoryAtPort(DIAGONAL_INDEX); if (diagIndMem == nullptr) OPENVINO_THROW(errorPrefix, " doesn't contain diag_index data"); - const int *diagIndexPtr = diagIndMem->getDataAs(); + const int* diagIndexPtr = diagIndMem->getDataAs(); return diagIndexPtr[0]; } inline const std::vector getBatchShape() const { if 
(withBatchShape) { - const int batchShapeSize = static_cast(getSrcMemoryAtPort(BATCH_SHAPE)->getShape().getElementsCount()); + const int batchShapeSize = + static_cast(getSrcMemoryAtPort(BATCH_SHAPE)->getShape().getElementsCount()); std::vector batchShape(batchShapeSize); - const int *batchShapePtr = getSrcDataAtPortAs(BATCH_SHAPE); + const int* batchShapePtr = getSrcDataAtPortAs(BATCH_SHAPE); batchShape.assign(batchShapePtr, batchShapePtr + batchShapeSize); return batchShape; } else { - return std::vector {}; + return std::vector{}; } } - inline const size_t getBatchVolume(const std::vector &batchShape) { + inline const size_t getBatchVolume(const std::vector& batchShape) { return std::accumulate(begin(batchShape), end(batchShape), 1, std::multiplies()); } bool withBatchShape = false; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/fake_quantize.cpp b/src/plugins/intel_cpu/src/nodes/fake_quantize.cpp index f12ab40cf5643b..9951c5176f0ad1 100644 --- a/src/plugins/intel_cpu/src/nodes/fake_quantize.cpp +++ b/src/plugins/intel_cpu/src/nodes/fake_quantize.cpp @@ -4,28 +4,27 @@ #include "fake_quantize.h" -#include -#include #include +#include + #include -#include #include - -#include "dnnl_types.h" -#include "dnnl_extension_utils.h" -#include "cpu/x64/jit_generator.hpp" #include +#include +#include +#include +#include -#include "openvino/core/parallel.hpp" -#include "utils/general_utils.h" -#include "utils/cpu_utils.hpp" -#include -#include "memory_desc/dnnl_blocked_memory_desc.h" #include "common/cpu_memcpy.h" #include "common/primitive_hashing_utils.hpp" -#include - +#include "cpu/x64/jit_generator.hpp" +#include "dnnl_extension_utils.h" +#include "dnnl_types.h" +#include "memory_desc/dnnl_blocked_memory_desc.h" +#include "openvino/core/parallel.hpp" #include "openvino/opsets/opset1.hpp" +#include "utils/cpu_utils.hpp" +#include 
"utils/general_utils.h" #include "utils/ngraph_utils.hpp" // Quantization ranges validation is switched off by default in order to avoid regressions on user side @@ -45,13 +44,15 @@ namespace ov { namespace intel_cpu { namespace node { #if defined(OPENVINO_ARCH_X86_64) -#define GET_OFF(field) offsetof(jit_quantize_call_args, field) +# define GET_OFF(field) offsetof(jit_quantize_call_args, field) template struct jit_uni_binarization_kernel : public jit_uni_quantize_kernel, public jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_binarization_kernel) - explicit jit_uni_binarization_kernel(const jit_quantize_params& jqp) : jit_uni_quantize_kernel(jqp), jit_generator(jit_name()) {} + explicit jit_uni_binarization_kernel(const jit_quantize_params& jqp) + : jit_uni_quantize_kernel(jqp), + jit_generator(jit_name()) {} void create_ker() override { jit_generator::create_kernel(); @@ -77,7 +78,8 @@ struct jit_uni_binarization_kernel : public jit_uni_quantize_kernel, public jit_ Label tail_label; Label exit_label; - L(unrolled_loop_label); { + L(unrolled_loop_label); + { int step = isa == cpu::x64::sse41 ? nbits / 2 : isa == cpu::x64::avx2 ? nbits : 2 * nbits; const int ur_ch = isa == cpu::x64::sse41 ? nbits : isa == cpu::x64::avx2 ? 
nbits / 2 : nbits / 4; const int unrolled_loop_step = ur_ch * step; @@ -87,9 +89,9 @@ struct jit_uni_binarization_kernel : public jit_uni_quantize_kernel, public jit_ xor_(reg_bin_32, reg_bin_32); for (int ch = 0; ch < ur_ch; ch++) { - uni_vmovups(vmm_src(0), ptr[reg_from + ch*step*sizeof(float)]); - uni_vmovups(vmm_wei(0), ptr[reg_thresholds + ch*step*sizeof(float)]); - uni_vmovups(vmm_mask(0), ptr[reg_output_mask + ch*step*sizeof(float)]); + uni_vmovups(vmm_src(0), ptr[reg_from + ch * step * sizeof(float)]); + uni_vmovups(vmm_wei(0), ptr[reg_thresholds + ch * step * sizeof(float)]); + uni_vmovups(vmm_mask(0), ptr[reg_output_mask + ch * step * sizeof(float)]); if (isa == avx512_core) { vcmpps(k_mask0, vmm_src(0), vmm_wei(0), _cmp_gt_os); vptestmd(k_mask1, vmm_mask(0), vmm_mask(0)); @@ -105,16 +107,17 @@ struct jit_uni_binarization_kernel : public jit_uni_quantize_kernel, public jit_ } mov(ptr[reg_to], reg_bin_32); - add(reg_from, unrolled_loop_step*sizeof(float)); - add(reg_thresholds, unrolled_loop_step*sizeof(float)); - add(reg_output_mask, unrolled_loop_step*sizeof(float)); + add(reg_from, unrolled_loop_step * sizeof(float)); + add(reg_thresholds, unrolled_loop_step * sizeof(float)); + add(reg_output_mask, unrolled_loop_step * sizeof(float)); add(reg_to, sizeof(uint32_t)); sub(reg_work_amount, unrolled_loop_step); jmp(unrolled_loop_label, T_NEAR); } - L(main_loop_label); { + L(main_loop_label); + { int repeats = isa == cpu::x64::sse41 ? 2 : 1; int step = isa == cpu::x64::sse41 ? nbits / 2 : isa == cpu::x64::avx2 ? 
nbits : nbits * 2; const int main_loop_step = step * repeats; @@ -124,9 +127,9 @@ struct jit_uni_binarization_kernel : public jit_uni_quantize_kernel, public jit_ xor_(reg_bin_32, reg_bin_32); for (int i = 0; i < repeats; i++) { - uni_vmovups(vmm_src(0), ptr[reg_from + i*step*sizeof(float)]); - uni_vmovups(vmm_wei(0), ptr[reg_thresholds + i*step*sizeof(float)]); - uni_vmovups(vmm_mask(0), ptr[reg_output_mask + i*step*sizeof(float)]); + uni_vmovups(vmm_src(0), ptr[reg_from + i * step * sizeof(float)]); + uni_vmovups(vmm_wei(0), ptr[reg_thresholds + i * step * sizeof(float)]); + uni_vmovups(vmm_mask(0), ptr[reg_output_mask + i * step * sizeof(float)]); if (isa == avx512_core) { vcmpps(k_mask0, vmm_src(0), vmm_wei(0), _cmp_gt_os); vptestmd(k_mask1, vmm_mask(0), vmm_mask(0)); @@ -145,16 +148,17 @@ struct jit_uni_binarization_kernel : public jit_uni_quantize_kernel, public jit_ else mov(ptr[reg_to], reg_bin_8); - add(reg_from, main_loop_step*sizeof(float)); - add(reg_thresholds, main_loop_step*sizeof(float)); - add(reg_output_mask, main_loop_step*sizeof(float)); + add(reg_from, main_loop_step * sizeof(float)); + add(reg_thresholds, main_loop_step * sizeof(float)); + add(reg_output_mask, main_loop_step * sizeof(float)); add(reg_to, isa == avx512_core ? 
sizeof(uint16_t) : sizeof(uint8_t)); sub(reg_work_amount, main_loop_step); jmp(main_loop_label, T_NEAR); } - L(tail_label); { + L(tail_label); + { if (tail_size != 0) { xor_(reg_bin_32, reg_bin_32); mov(reg_mask, 1); @@ -188,15 +192,27 @@ struct jit_uni_binarization_kernel : public jit_uni_quantize_kernel, public jit_ } private: - using Vmm = typename conditional3::type; + using Vmm = + typename conditional3::type; - inline Vmm vmm_src(int idx) { return Vmm(idx); } - inline Xmm xmm_src(int idx) { return Xmm(idx); } - inline Vmm vmm_wei(int idx) { return Vmm(idx + 4); } - inline Vmm vmm_mask(int idx) { return Vmm(idx + 5); } - inline Xmm xmm_wei(int idx) { return Xmm(idx + 4); } - inline Xmm xmm_mask(int idx) { return Xmm(idx + 5); } + inline Vmm vmm_src(int idx) { + return Vmm(idx); + } + inline Xmm xmm_src(int idx) { + return Xmm(idx); + } + inline Vmm vmm_wei(int idx) { + return Vmm(idx + 4); + } + inline Vmm vmm_mask(int idx) { + return Vmm(idx + 5); + } + inline Xmm xmm_wei(int idx) { + return Xmm(idx + 4); + } + inline Xmm xmm_mask(int idx) { + return Xmm(idx + 5); + } Reg64 param = abi_param1; Reg64 reg_from = r8; @@ -219,7 +235,9 @@ template struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_quantization_kernel) - explicit jit_uni_quantization_kernel(const jit_quantize_params& jqp) : jit_uni_quantize_kernel(jqp), jit_generator(jit_name()) {} + explicit jit_uni_quantization_kernel(const jit_quantize_params& jqp) + : jit_uni_quantize_kernel(jqp), + jit_generator(jit_name()) {} void create_ker() override { jit_generator::create_kernel(); @@ -237,37 +255,78 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ else compute_generic(); - this->postamble(); } private: - using Vmm = typename conditional3::type; - - inline Vmm vmm_val(int idx) { return Vmm(idx + 0); } - inline Vmm vmm_crop_low(int idx) { return Vmm(idx + 2); } - inline Vmm vmm_crop_high(int 
idx) { return Vmm(idx + 4); } - inline Vmm vmm_input_scale(int idx) { return Vmm(idx + 6); } - inline Vmm vmm_input_shift(int idx) { return Vmm(idx + 8); } - inline Vmm vmm_output_scale(int idx) { return Vmm(idx + 10); } - inline Vmm vmm_output_shift(int idx) { return Vmm(idx + 12); } - - inline Ymm ymm_val(int idx) { return Ymm(idx + 0); } - inline Ymm ymm_crop_low(int idx) { return Ymm(idx + 2); } - inline Ymm ymm_crop_high(int idx) { return Ymm(idx + 4); } - inline Ymm ymm_input_scale(int idx) { return Ymm(idx + 6); } - inline Ymm ymm_input_shift(int idx) { return Ymm(idx + 8); } - inline Ymm ymm_output_scale(int idx) { return Ymm(idx + 10); } - inline Ymm ymm_output_shift(int idx) { return Ymm(idx + 12); } - - inline Xmm xmm_val(int idx) { return Xmm(idx + 0); } - inline Xmm xmm_crop_low(int idx) { return Xmm(idx + 2); } - inline Xmm xmm_crop_high(int idx) { return Xmm(idx + 4); } - inline Xmm xmm_input_scale(int idx) { return Xmm(idx + 6); } - inline Xmm xmm_input_shift(int idx) { return Xmm(idx + 8); } - inline Xmm xmm_output_scale(int idx) { return Xmm(idx + 10); } - inline Xmm xmm_output_shift(int idx) { return Xmm(idx + 12); } + using Vmm = + typename conditional3::type; + + inline Vmm vmm_val(int idx) { + return Vmm(idx + 0); + } + inline Vmm vmm_crop_low(int idx) { + return Vmm(idx + 2); + } + inline Vmm vmm_crop_high(int idx) { + return Vmm(idx + 4); + } + inline Vmm vmm_input_scale(int idx) { + return Vmm(idx + 6); + } + inline Vmm vmm_input_shift(int idx) { + return Vmm(idx + 8); + } + inline Vmm vmm_output_scale(int idx) { + return Vmm(idx + 10); + } + inline Vmm vmm_output_shift(int idx) { + return Vmm(idx + 12); + } + + inline Ymm ymm_val(int idx) { + return Ymm(idx + 0); + } + inline Ymm ymm_crop_low(int idx) { + return Ymm(idx + 2); + } + inline Ymm ymm_crop_high(int idx) { + return Ymm(idx + 4); + } + inline Ymm ymm_input_scale(int idx) { + return Ymm(idx + 6); + } + inline Ymm ymm_input_shift(int idx) { + return Ymm(idx + 8); + } + inline Ymm 
ymm_output_scale(int idx) { + return Ymm(idx + 10); + } + inline Ymm ymm_output_shift(int idx) { + return Ymm(idx + 12); + } + + inline Xmm xmm_val(int idx) { + return Xmm(idx + 0); + } + inline Xmm xmm_crop_low(int idx) { + return Xmm(idx + 2); + } + inline Xmm xmm_crop_high(int idx) { + return Xmm(idx + 4); + } + inline Xmm xmm_input_scale(int idx) { + return Xmm(idx + 6); + } + inline Xmm xmm_input_shift(int idx) { + return Xmm(idx + 8); + } + inline Xmm xmm_output_scale(int idx) { + return Xmm(idx + 10); + } + inline Xmm xmm_output_shift(int idx) { + return Xmm(idx + 12); + } Vmm vmm_zero = Vmm(14); @@ -296,24 +355,34 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ bool do_dequantization = true; inline void load_broadcasted_vectors_only(size_t idx) { - const auto &broadcasted = jqp_.broadcasted; - if (broadcasted[static_cast(FQ_add_input_type::CROP_LOW)]) uni_vbroadcastss(vmm_crop_low(idx), ptr[reg_crop_low]); - if (broadcasted[static_cast(FQ_add_input_type::CROP_HIGH)]) uni_vbroadcastss(vmm_crop_high(idx), ptr[reg_crop_high]); - if (broadcasted[static_cast(FQ_add_input_type::INPUT_SCALE)]) uni_vbroadcastss(vmm_input_scale(idx), ptr[reg_input_scale]); - if (broadcasted[static_cast(FQ_add_input_type::INPUT_SHIFT)]) uni_vbroadcastss(vmm_input_shift(idx), ptr[reg_input_shift]); + const auto& broadcasted = jqp_.broadcasted; + if (broadcasted[static_cast(FQ_add_input_type::CROP_LOW)]) + uni_vbroadcastss(vmm_crop_low(idx), ptr[reg_crop_low]); + if (broadcasted[static_cast(FQ_add_input_type::CROP_HIGH)]) + uni_vbroadcastss(vmm_crop_high(idx), ptr[reg_crop_high]); + if (broadcasted[static_cast(FQ_add_input_type::INPUT_SCALE)]) + uni_vbroadcastss(vmm_input_scale(idx), ptr[reg_input_scale]); + if (broadcasted[static_cast(FQ_add_input_type::INPUT_SHIFT)]) + uni_vbroadcastss(vmm_input_shift(idx), ptr[reg_input_shift]); if (do_dequantization) { - if (broadcasted[static_cast(FQ_add_input_type::OUTPUT_SCALE)]) 
uni_vbroadcastss(vmm_output_scale(idx), ptr[reg_output_scale]); - if (broadcasted[static_cast(FQ_add_input_type::OUTPUT_SHIFT)]) uni_vbroadcastss(vmm_output_shift(idx), ptr[reg_output_shift]); + if (broadcasted[static_cast(FQ_add_input_type::OUTPUT_SCALE)]) + uni_vbroadcastss(vmm_output_scale(idx), ptr[reg_output_scale]); + if (broadcasted[static_cast(FQ_add_input_type::OUTPUT_SHIFT)]) + uni_vbroadcastss(vmm_output_shift(idx), ptr[reg_output_shift]); } } template inline void load_not_broadcasted_vectors_only(size_t idx, size_t offset) { - const auto &broadcasted = jqp_.broadcasted; - if (!broadcasted[static_cast(FQ_add_input_type::CROP_LOW)]) uni_vmovups(T(vmm_crop_low(idx).getIdx()), ptr[reg_crop_low + offset]); - if (!broadcasted[static_cast(FQ_add_input_type::CROP_HIGH)]) uni_vmovups(T(vmm_crop_high(idx).getIdx()), ptr[reg_crop_high + offset]); - if (!broadcasted[static_cast(FQ_add_input_type::INPUT_SCALE)]) uni_vmovups(T(vmm_input_scale(idx).getIdx()), ptr[reg_input_scale + offset]); - if (!broadcasted[static_cast(FQ_add_input_type::INPUT_SHIFT)]) uni_vmovups(T(vmm_input_shift(idx).getIdx()), ptr[reg_input_shift + offset]); + const auto& broadcasted = jqp_.broadcasted; + if (!broadcasted[static_cast(FQ_add_input_type::CROP_LOW)]) + uni_vmovups(T(vmm_crop_low(idx).getIdx()), ptr[reg_crop_low + offset]); + if (!broadcasted[static_cast(FQ_add_input_type::CROP_HIGH)]) + uni_vmovups(T(vmm_crop_high(idx).getIdx()), ptr[reg_crop_high + offset]); + if (!broadcasted[static_cast(FQ_add_input_type::INPUT_SCALE)]) + uni_vmovups(T(vmm_input_scale(idx).getIdx()), ptr[reg_input_scale + offset]); + if (!broadcasted[static_cast(FQ_add_input_type::INPUT_SHIFT)]) + uni_vmovups(T(vmm_input_shift(idx).getIdx()), ptr[reg_input_shift + offset]); if (do_dequantization) { if (!broadcasted[static_cast(FQ_add_input_type::OUTPUT_SCALE)]) uni_vmovups(T(vmm_output_scale(idx).getIdx()), ptr[reg_output_scale + offset]); @@ -323,14 +392,20 @@ struct jit_uni_quantization_kernel : public 
jit_uni_quantize_kernel, public jit_ } inline void increase_ptrs_if_not_broadcasted(size_t offset) { - const auto &broadcasted = jqp_.broadcasted; - if (!broadcasted[static_cast(FQ_add_input_type::CROP_LOW)]) add(reg_crop_low, offset); - if (!broadcasted[static_cast(FQ_add_input_type::CROP_HIGH)]) add(reg_crop_high, offset); - if (!broadcasted[static_cast(FQ_add_input_type::INPUT_SCALE)]) add(reg_input_scale, offset); - if (!broadcasted[static_cast(FQ_add_input_type::INPUT_SHIFT)]) add(reg_input_shift, offset); + const auto& broadcasted = jqp_.broadcasted; + if (!broadcasted[static_cast(FQ_add_input_type::CROP_LOW)]) + add(reg_crop_low, offset); + if (!broadcasted[static_cast(FQ_add_input_type::CROP_HIGH)]) + add(reg_crop_high, offset); + if (!broadcasted[static_cast(FQ_add_input_type::INPUT_SCALE)]) + add(reg_input_scale, offset); + if (!broadcasted[static_cast(FQ_add_input_type::INPUT_SHIFT)]) + add(reg_input_shift, offset); if (do_dequantization) { - if (!broadcasted[static_cast(FQ_add_input_type::OUTPUT_SCALE)]) add(reg_output_scale, offset); - if (!broadcasted[static_cast(FQ_add_input_type::OUTPUT_SHIFT)]) add(reg_output_shift, offset); + if (!broadcasted[static_cast(FQ_add_input_type::OUTPUT_SCALE)]) + add(reg_output_scale, offset); + if (!broadcasted[static_cast(FQ_add_input_type::OUTPUT_SHIFT)]) + add(reg_output_shift, offset); } } @@ -373,7 +448,8 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ uni_vbroadcastss(vmm_output_shift(0), ptr[reg_output_shift]); } - L(main_loop_label); { + L(main_loop_label); + { cmp(reg_work_amount, simd_w); jl(tail_blk4_label, T_NEAR); @@ -383,8 +459,10 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ uni_vminps(vmm_val(i), vmm_val(i), vmm_crop_high(0)); uni_vmaxps(vmm_val(i), vmm_val(i), vmm_crop_low(0)); uni_vfmadd213ps(vmm_val(i), vmm_input_scale(0), vmm_input_shift(0)); - if (do_rounding) uni_vroundps(vmm_val(i), vmm_val(i), 0); - if (do_dequantization) 
uni_vfmadd213ps(vmm_val(i), vmm_output_scale(0), vmm_output_shift(0)); + if (do_rounding) + uni_vroundps(vmm_val(i), vmm_val(i), 0); + if (do_dequantization) + uni_vfmadd213ps(vmm_val(i), vmm_output_scale(0), vmm_output_shift(0)); store_vector(ptr[reg_to + i * (simd_w / 2) * dst_type_size], vmm_val(i), jqp_.dst_prc); } @@ -396,7 +474,8 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ jmp(main_loop_label, T_NEAR); } - L(tail_blk4_label); { + L(tail_blk4_label); + { cmp(reg_work_amount, tail_simd_w); jl(tail_blk4_exit_label, T_NEAR); @@ -405,8 +484,10 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ uni_vminps(xmm_val(0), xmm_val(0), xmm_crop_high(0)); uni_vmaxps(xmm_val(0), xmm_val(0), xmm_crop_low(0)); uni_vfmadd213ps(xmm_val(0), xmm_input_scale(0), xmm_input_shift(0)); - if (do_rounding) uni_vroundps(xmm_val(0), xmm_val(0), 0); - if (do_dequantization) uni_vfmadd213ps(xmm_val(0), xmm_output_scale(0), xmm_output_shift(0)); + if (do_rounding) + uni_vroundps(xmm_val(0), xmm_val(0), 0); + if (do_dequantization) + uni_vfmadd213ps(xmm_val(0), xmm_output_scale(0), xmm_output_shift(0)); store_vector(ptr[reg_to], xmm_val(0), jqp_.dst_prc); @@ -420,7 +501,8 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ mov(aux_reg_from, reg_from); mov(aux_reg_to, reg_to); - L(tail_loop_label); { + L(tail_loop_label); + { cmp(reg_work_amount, 0); jle(exit_label, T_NEAR); @@ -429,8 +511,10 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ uni_vminps(xmm_val(0), xmm_val(0), xmm_crop_high(0)); uni_vmaxps(xmm_val(0), xmm_val(0), xmm_crop_low(0)); uni_vfmadd213ps(xmm_val(0), xmm_input_scale(0), xmm_input_shift(0)); - if (do_rounding) uni_vroundps(xmm_val(0), xmm_val(0), 0); - if (do_dequantization) uni_vfmadd213ps(xmm_val(0), xmm_output_scale(0), xmm_output_shift(0)); + if (do_rounding) + uni_vroundps(xmm_val(0), xmm_val(0), 0); + if (do_dequantization) + 
uni_vfmadd213ps(xmm_val(0), xmm_output_scale(0), xmm_output_shift(0)); store_scalar(ptr[aux_reg_to], xmm_val(0), jqp_.dst_prc); @@ -496,7 +580,8 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ load_not_broadcasted_vectors_only(i, i * (simd_w / 2) * sizeof(float)); } - L(main_loop_label); { + L(main_loop_label); + { cmp(reg_work_amount, 0); jle(exit_label, T_NEAR); @@ -506,8 +591,10 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ uni_vminps(vmm_val(i), vmm_val(i), vmm_crop_high(i)); uni_vmaxps(vmm_val(i), vmm_val(i), vmm_crop_low(i)); uni_vfmadd213ps(vmm_val(i), vmm_input_scale(i), vmm_input_shift(i)); - if (do_rounding) uni_vroundps(vmm_val(i), vmm_val(i), 0); - if (do_dequantization) uni_vfmadd213ps(vmm_val(i), vmm_output_scale(i), vmm_output_shift(i)); + if (do_rounding) + uni_vroundps(vmm_val(i), vmm_val(i), 0); + if (do_dequantization) + uni_vfmadd213ps(vmm_val(i), vmm_output_scale(i), vmm_output_shift(i)); store_vector(ptr[reg_to + i * (simd_w / 2) * dst_type_size], vmm_val(i), jqp_.dst_prc); } @@ -531,7 +618,8 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ load_not_broadcasted_vectors_only(0, 0); - L(tail_blk8_loop_label); { + L(tail_blk8_loop_label); + { cmp(reg_work_amount, 0); jle(tail_blk8_exit_label, T_NEAR); @@ -540,8 +628,10 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ uni_vminps(ymm_val(0), ymm_val(0), ymm_crop_high(0)); uni_vmaxps(ymm_val(0), ymm_val(0), ymm_crop_low(0)); uni_vfmadd213ps(ymm_val(0), ymm_input_scale(0), ymm_input_shift(0)); - if (do_rounding) uni_vroundps(ymm_val(0), ymm_val(0), 0); - if (do_dequantization) uni_vfmadd213ps(ymm_val(0), ymm_output_scale(0), ymm_output_shift(0)); + if (do_rounding) + uni_vroundps(ymm_val(0), ymm_val(0), 0); + if (do_dequantization) + uni_vfmadd213ps(ymm_val(0), ymm_output_scale(0), ymm_output_shift(0)); store_vector(ptr[aux_reg_to], ymm_val(0), jqp_.dst_prc); 
@@ -571,7 +661,8 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ load_not_broadcasted_vectors_only(0, 0); - L(tail_blk4_loop_label); { + L(tail_blk4_loop_label); + { cmp(reg_work_amount, 0); jle(tail_blk4_exit_label, T_NEAR); @@ -580,8 +671,10 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ uni_vminps(xmm_val(0), xmm_val(0), xmm_crop_high(0)); uni_vmaxps(xmm_val(0), xmm_val(0), xmm_crop_low(0)); uni_vfmadd213ps(xmm_val(0), xmm_input_scale(0), xmm_input_shift(0)); - if (do_rounding) uni_vroundps(xmm_val(0), xmm_val(0), 0); - if (do_dequantization) uni_vfmadd213ps(xmm_val(0), xmm_output_scale(0), xmm_output_shift(0)); + if (do_rounding) + uni_vroundps(xmm_val(0), xmm_val(0), 0); + if (do_dequantization) + uni_vfmadd213ps(xmm_val(0), xmm_output_scale(0), xmm_output_shift(0)); store_vector(ptr[aux_reg_to], xmm_val(0), jqp_.dst_prc); @@ -608,13 +701,14 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ mov(aux_reg_from, reg_from); mov(reg_work_amount, ptr[param + GET_OFF(work_amount)]); - L(tail_loop_label); { + L(tail_loop_label); + { cmp(reg_work_amount, 0); jle(exit_label, T_NEAR); Label end_unroll; auto tail_unroll = [&](size_t iter) { - const auto &broadcasted = jqp_.broadcasted; + const auto& broadcasted = jqp_.broadcasted; for (size_t i = 0; i < iter; i++) { if (!broadcasted[static_cast(FQ_add_input_type::CROP_LOW)]) uni_vmovss(xmm_crop_low(0), ptr[reg_crop_low + i * wei_type_size]); @@ -636,8 +730,10 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ uni_vminps(xmm_val(0), xmm_val(0), xmm_crop_high(0)); uni_vmaxps(xmm_val(0), xmm_val(0), xmm_crop_low(0)); uni_vfmadd213ps(xmm_val(0), xmm_input_scale(0), xmm_input_shift(0)); - if (do_rounding) uni_vroundps(xmm_val(0), xmm_val(0), 0); - if (do_dequantization) uni_vfmadd213ps(xmm_val(0), xmm_output_scale(0), xmm_output_shift(0)); + if (do_rounding) + uni_vroundps(xmm_val(0), 
xmm_val(0), 0); + if (do_dequantization) + uni_vfmadd213ps(xmm_val(0), xmm_output_scale(0), xmm_output_shift(0)); store_scalar(ptr[aux_reg_to + i * dst_type_size], xmm_val(0), jqp_.dst_prc); } @@ -667,20 +763,20 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ L(exit_label); } - inline void load_vector(Zmm zmm_src, const Xbyak::Address &op, ov::element::Type src_prc) { + inline void load_vector(Zmm zmm_src, const Xbyak::Address& op, ov::element::Type src_prc) { switch (src_prc) { - case ov::element::f32: - case ov::element::i32: - uni_vmovups(zmm_src, op); - break; - case ov::element::i8: - uni_vpmovsxbd(zmm_src, op); - break; - case ov::element::u8: - uni_vpmovzxbd(zmm_src, op); - break; - default: - assert(!"unknown src_prc"); + case ov::element::f32: + case ov::element::i32: + uni_vmovups(zmm_src, op); + break; + case ov::element::i8: + uni_vpmovsxbd(zmm_src, op); + break; + case ov::element::u8: + uni_vpmovzxbd(zmm_src, op); + break; + default: + assert(!"unknown src_prc"); } if (src_prc != ov::element::f32) { @@ -688,20 +784,20 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ } } - inline void load_vector(Ymm ymm_src, const Xbyak::Address &op, ov::element::Type src_prc) { + inline void load_vector(Ymm ymm_src, const Xbyak::Address& op, ov::element::Type src_prc) { switch (src_prc) { - case ov::element::f32: - case ov::element::i32: - uni_vmovups(ymm_src, op); - break; - case ov::element::i8: - uni_vpmovsxbd(ymm_src, op); - break; - case ov::element::u8: - uni_vpmovzxbd(ymm_src, op); - break; - default: - assert(!"unknown src_prc"); + case ov::element::f32: + case ov::element::i32: + uni_vmovups(ymm_src, op); + break; + case ov::element::i8: + uni_vpmovsxbd(ymm_src, op); + break; + case ov::element::u8: + uni_vpmovzxbd(ymm_src, op); + break; + default: + assert(!"unknown src_prc"); } if (src_prc != ov::element::f32) { @@ -709,20 +805,20 @@ struct jit_uni_quantization_kernel : public 
jit_uni_quantize_kernel, public jit_ } } - inline void load_vector(Xmm xmm_src, const Xbyak::Address &op, ov::element::Type src_prc) { + inline void load_vector(Xmm xmm_src, const Xbyak::Address& op, ov::element::Type src_prc) { switch (src_prc) { - case ov::element::f32: - case ov::element::i32: - uni_vmovups(xmm_src, op); - break; - case ov::element::i8: - uni_vpmovsxbd(xmm_src, op); - break; - case ov::element::u8: - uni_vpmovzxbd(xmm_src, op); - break; - default: - assert(!"unknown src_prc"); + case ov::element::f32: + case ov::element::i32: + uni_vmovups(xmm_src, op); + break; + case ov::element::i8: + uni_vpmovsxbd(xmm_src, op); + break; + case ov::element::u8: + uni_vpmovzxbd(xmm_src, op); + break; + default: + assert(!"unknown src_prc"); } if (src_prc != ov::element::f32) { @@ -730,22 +826,22 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ } } - inline void load_scalar(Xmm xmm_src, const Xbyak::Address &op, ov::element::Type src_prc) { + inline void load_scalar(Xmm xmm_src, const Xbyak::Address& op, ov::element::Type src_prc) { switch (src_prc) { - case ov::element::f32: - case ov::element::i32: - uni_vmovss(xmm_src, op); - break; - case ov::element::i8: - movsx(reg_tmp_32, op); - uni_vmovq(xmm_src, reg_tmp_64); - break; - case ov::element::u8: - movzx(reg_tmp_32, op); - uni_vmovq(xmm_src, reg_tmp_64); - break; - default: - assert(!"unknown src_prc"); + case ov::element::f32: + case ov::element::i32: + uni_vmovss(xmm_src, op); + break; + case ov::element::i8: + movsx(reg_tmp_32, op); + uni_vmovq(xmm_src, reg_tmp_64); + break; + case ov::element::u8: + movzx(reg_tmp_32, op); + uni_vmovq(xmm_src, reg_tmp_64); + break; + default: + assert(!"unknown src_prc"); } if (src_prc != ov::element::f32) { @@ -753,29 +849,29 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ } } - inline void store_vector(const Xbyak::Address &op, Zmm zmm_dst, ov::element::Type dst_prc) { + inline void 
store_vector(const Xbyak::Address& op, Zmm zmm_dst, ov::element::Type dst_prc) { if (dst_prc != ov::element::f32) { uni_vcvtps2dq(zmm_dst, zmm_dst); } switch (dst_prc) { - case ov::element::f32: - case ov::element::i32: - uni_vmovups(op, zmm_dst); - break; - case ov::element::i8: - vpmovsdb(op, zmm_dst); - break; - case ov::element::u8: - vpmaxsd(zmm_dst, zmm_dst, vmm_zero); - vpmovusdb(op, zmm_dst); - break; - default: - assert(!"unknown dst_prc"); - } - } - - inline void store_vector(const Xbyak::Address &op, Ymm ymm_dst, ov::element::Type dst_prc) { + case ov::element::f32: + case ov::element::i32: + uni_vmovups(op, zmm_dst); + break; + case ov::element::i8: + vpmovsdb(op, zmm_dst); + break; + case ov::element::u8: + vpmaxsd(zmm_dst, zmm_dst, vmm_zero); + vpmovusdb(op, zmm_dst); + break; + default: + assert(!"unknown dst_prc"); + } + } + + inline void store_vector(const Xbyak::Address& op, Ymm ymm_dst, ov::element::Type dst_prc) { Xmm xmm_dst = Xmm(ymm_dst.getIdx()); if (dst_prc != ov::element::f32) { @@ -783,82 +879,82 @@ struct jit_uni_quantization_kernel : public jit_uni_quantize_kernel, public jit_ } switch (dst_prc) { - case ov::element::f32: - case ov::element::i32: - uni_vmovups(op, ymm_dst); - break; - case ov::element::i8: - uni_vpackssdw(ymm_dst, ymm_dst, ymm_dst); + case ov::element::f32: + case ov::element::i32: + uni_vmovups(op, ymm_dst); + break; + case ov::element::i8: + uni_vpackssdw(ymm_dst, ymm_dst, ymm_dst); - vpermq(ymm_dst, ymm_dst, 0x08); + vpermq(ymm_dst, ymm_dst, 0x08); - uni_vpacksswb(ymm_dst, ymm_dst, ymm_dst); + uni_vpacksswb(ymm_dst, ymm_dst, ymm_dst); - vmovq(op, xmm_dst); - break; - case ov::element::u8: - uni_vpackusdw(ymm_dst, ymm_dst, ymm_dst); + vmovq(op, xmm_dst); + break; + case ov::element::u8: + uni_vpackusdw(ymm_dst, ymm_dst, ymm_dst); - vpermq(ymm_dst, ymm_dst, 0x08); + vpermq(ymm_dst, ymm_dst, 0x08); - uni_vpackuswb(ymm_dst, ymm_dst, ymm_dst); + uni_vpackuswb(ymm_dst, ymm_dst, ymm_dst); - vmovq(op, xmm_dst); - break; - 
default: - assert(!"unknown dst_prc"); + vmovq(op, xmm_dst); + break; + default: + assert(!"unknown dst_prc"); } } - inline void store_vector(const Xbyak::Address &op, Xmm xmm_dst, ov::element::Type dst_prc) { + inline void store_vector(const Xbyak::Address& op, Xmm xmm_dst, ov::element::Type dst_prc) { if (dst_prc != ov::element::f32) { uni_vcvtps2dq(xmm_dst, xmm_dst); } switch (dst_prc) { - case ov::element::f32: - case ov::element::i32: - uni_vmovups(op, xmm_dst); - break; - case ov::element::i8: - uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); - uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst); - uni_vmovd(op, xmm_dst); - break; - case ov::element::u8: - uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst); - uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst); - uni_vmovd(op, xmm_dst); - break; - default: - assert(!"unknown dst_prc"); - } - } - - inline void store_scalar(const Xbyak::Address &op, Xmm xmm_dst, ov::element::Type dst_prc) { + case ov::element::f32: + case ov::element::i32: + uni_vmovups(op, xmm_dst); + break; + case ov::element::i8: + uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); + uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst); + uni_vmovd(op, xmm_dst); + break; + case ov::element::u8: + uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst); + uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst); + uni_vmovd(op, xmm_dst); + break; + default: + assert(!"unknown dst_prc"); + } + } + + inline void store_scalar(const Xbyak::Address& op, Xmm xmm_dst, ov::element::Type dst_prc) { if (dst_prc != ov::element::f32) { uni_vcvtps2dq(xmm_dst, xmm_dst); } switch (dst_prc) { - case ov::element::f32: - case ov::element::i32: - uni_vmovss(op, xmm_dst); - break; - case ov::element::i8: - uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); - uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst); - uni_vmovq(reg_tmp_64, xmm_dst); - mov(op, reg_tmp_8); - break; - case ov::element::u8: - uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst); - uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst); - uni_vmovq(reg_tmp_64, xmm_dst); - mov(op, reg_tmp_8); - break; - default: - 
assert(!"unknown dst_prc"); + case ov::element::f32: + case ov::element::i32: + uni_vmovss(op, xmm_dst); + break; + case ov::element::i8: + uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst); + uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst); + uni_vmovq(reg_tmp_64, xmm_dst); + mov(op, reg_tmp_8); + break; + case ov::element::u8: + uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst); + uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst); + uni_vmovq(reg_tmp_64, xmm_dst); + mov(op, reg_tmp_8); + break; + default: + assert(!"unknown dst_prc"); } } }; @@ -877,7 +973,8 @@ bool FakeQuantize::isSupportedOperation(const std::shared_ptr& o } for (size_t i = 1; i < fq->get_input_size(); i++) { if (fq->get_input_partial_shape(i).rank().get_length() > 5) { - errorMessage = "Doesn't support 'range' input with rank: " + std::to_string(fq->get_input_partial_shape(i).rank().get_length()); + errorMessage = "Doesn't support 'range' input with rank: " + + std::to_string(fq->get_input_partial_shape(i).rank().get_length()); return false; } } @@ -935,7 +1032,7 @@ struct FakeQuantKey { seed = hash_combine(seed, jqp.wei_prc.hash()); seed = hash_combine(seed, jqp.dst_prc.hash()); seed = hash_combine(seed, jqp.op_type); - if (jqp.op_type == Algorithm::FQBinarization) { + if (jqp.op_type == Algorithm::FQBinarization) { seed = hash_combine(seed, jqp.c); } else { seed = hash_combine(seed, jqp.broadcasted); @@ -959,8 +1056,8 @@ struct FakeQuantKey { }; } // namespace -FakeQuantize::FakeQuantize(const std::shared_ptr& op, const GraphContext::CPtr context) : - Node(op, context, PassThroughShapeInferFactory()) { +FakeQuantize::FakeQuantize(const std::shared_ptr& op, const GraphContext::CPtr context) + : Node(op, context, PassThroughShapeInferFactory()) { std::string errorMessage; if (isSupportedOperation(op, errorMessage)) { algorithm = Algorithm::FQCommon; @@ -1032,16 +1129,20 @@ FakeQuantize::FakeQuantize(const std::shared_ptr& op, const GraphConte OPENVINO_THROW(errorPrefix, "has different quantization axis size on 'data' and 
'range' inputs"); } - const auto inputLowNode = std::dynamic_pointer_cast(fq->get_input_node_shared_ptr(1)); + const auto inputLowNode = + std::dynamic_pointer_cast(fq->get_input_node_shared_ptr(1)); auto inputLowData = inputLowNode->cast_vector(); - const auto inputHighNode = std::dynamic_pointer_cast(fq->get_input_node_shared_ptr(2)); + const auto inputHighNode = + std::dynamic_pointer_cast(fq->get_input_node_shared_ptr(2)); auto inputHighData = inputHighNode->cast_vector(); - const auto outputLowNode = std::dynamic_pointer_cast(fq->get_input_node_shared_ptr(3)); + const auto outputLowNode = + std::dynamic_pointer_cast(fq->get_input_node_shared_ptr(3)); auto outputLowData = outputLowNode->cast_vector(); - const auto outputHighNode = std::dynamic_pointer_cast(fq->get_input_node_shared_ptr(4)); + const auto outputHighNode = + std::dynamic_pointer_cast(fq->get_input_node_shared_ptr(4)); auto outputHighData = outputHighNode->cast_vector(); binarization = levels == 2; @@ -1092,7 +1193,7 @@ FakeQuantize::FakeQuantize(const std::shared_ptr& op, const GraphConte } } } else { - auto allElementsAreEqual = [&](const std::vector &data, size_t size) { + auto allElementsAreEqual = [&](const std::vector& data, size_t size) { if (size == 0) return true; @@ -1146,9 +1247,21 @@ FakeQuantize::FakeQuantize(const std::shared_ptr& op, const GraphConte broadcasted[static_cast(FQ_add_input_type::OUTPUT_SCALE)] = outputScaleSize == 1; broadcasted[static_cast(FQ_add_input_type::OUTPUT_SHIFT)] = outputShiftSize == 1; - if (everyone_is(1u, cropLowSize, cropHighSize, inputScaleSize, inputShiftSize, outputScaleSize, outputShiftSize)) + if (everyone_is(1u, + cropLowSize, + cropHighSize, + inputScaleSize, + inputShiftSize, + outputScaleSize, + outputShiftSize)) broadcastingPolicy = PerTensor; - else if (one_of(1u, cropLowSize, cropHighSize, inputScaleSize, inputShiftSize, outputScaleSize, outputShiftSize)) + else if (one_of(1u, + cropLowSize, + cropHighSize, + inputScaleSize, + inputShiftSize, 
+ outputScaleSize, + outputShiftSize)) broadcastingPolicy = Mixed; else broadcastingPolicy = PerChannel; @@ -1224,7 +1337,10 @@ FakeQuantize::FakeQuantize(const std::shared_ptr& op, const GraphConte bool isFakeQuantization = true; bool isFakeQuantizationWithScale = true; - for (size_t i = 0; i < std::max(inputLowAxisSize, std::max(outputLowAxisSize, std::max(inputHighAxisSize, outputHighAxisSize))); i++) { + for (size_t i = 0; + i < std::max(inputLowAxisSize, + std::max(outputLowAxisSize, std::max(inputHighAxisSize, outputHighAxisSize))); + i++) { float il = inputLowData[isInputLowBroadcasted ? 0 : i]; float ol = outputLowData[isOutputLowBroadcasted ? 0 : i]; float ih = inputHighData[isInputHighBroadcasted ? 0 : i]; @@ -1236,7 +1352,10 @@ FakeQuantize::FakeQuantize(const std::shared_ptr& op, const GraphConte } if (isFakeQuantizationWithScale) { - for (size_t i = 0; i < std::max(inputLowAxisSize, std::max(outputLowAxisSize, std::max(inputHighAxisSize, outputHighAxisSize))); i++) { + for (size_t i = 0; + i < std::max(inputLowAxisSize, + std::max(outputLowAxisSize, std::max(inputHighAxisSize, outputHighAxisSize))); + i++) { float il = inputLowData[isInputLowBroadcasted ? 0 : i]; float ol = outputLowData[isOutputLowBroadcasted ? 0 : i]; float ih = inputHighData[isInputHighBroadcasted ? 0 : i]; @@ -1255,22 +1374,22 @@ FakeQuantize::FakeQuantize(const std::shared_ptr& op, const GraphConte std::vector FakeQuantize::getDataFormats() const { // Special case for first FQ in the network - const auto &dims = getInputShapeAtPort(0).getDims(); + const auto& dims = getInputShapeAtPort(0).getDims(); if (dims[getAxis()] == 3) { - return { LayoutType::ncsp }; + return {LayoutType::ncsp}; } else { if (isBinarization()) { - return { LayoutType::nspc }; + return {LayoutType::nspc}; } else { if (one_of(dims.size(), 4u, 5u)) { if (getAxis() == 1) { auto blkFormat = mayiuse(cpu::x64::avx512_core) ? 
LayoutType::nCsp16c : LayoutType::nCsp8c; - return { blkFormat, LayoutType::nspc, LayoutType::ncsp }; + return {blkFormat, LayoutType::nspc, LayoutType::ncsp}; } else { - return { LayoutType::ncsp }; + return {LayoutType::ncsp}; } } else { - return { LayoutType::ncsp }; + return {LayoutType::ncsp}; } } } @@ -1284,10 +1403,12 @@ void FakeQuantize::init() { inputPrecision = getOriginalInputPrecisionAtPort(0); outputPrecision = getOriginalOutputPrecisionAtPort(0); - if (inputPrecision != ov::element::f32 && inputPrecision != ov::element::u8 && inputPrecision != ov::element::i8) + if (inputPrecision != ov::element::f32 && inputPrecision != ov::element::u8 && + inputPrecision != ov::element::i8) inputPrecision = ov::element::f32; - if (outputPrecision != ov::element::f32 && outputPrecision != ov::element::u8 && outputPrecision != ov::element::i8) + if (outputPrecision != ov::element::f32 && outputPrecision != ov::element::u8 && + outputPrecision != ov::element::i8) outputPrecision = ov::element::f32; } } @@ -1381,7 +1502,8 @@ bool FakeQuantize::needPrepareParams() const { if (!selectedPrimitiveDescriptor) OPENVINO_THROW("CPU quantize node with name '", getName(), "' doesn't have primitive descriptors."); - if (internalBlobMemory.empty() || (selectedPrimitiveDescriptor->getImplementationType() != impl_desc_type::ref && inputShapesModified())) { + if (internalBlobMemory.empty() || + (selectedPrimitiveDescriptor->getImplementationType() != impl_desc_type::ref && inputShapesModified())) { return true; } @@ -1389,7 +1511,8 @@ bool FakeQuantize::needPrepareParams() const { const auto newPaddedSize = rnd_up(axisSize, 16); const auto currPaddedSize = rnd_up(currentAxisSize, 16); - return newPaddedSize != currPaddedSize || ((isInputLowBroadcasted || isOutputHighBroadcasted) && axisSize != currentAxisSize); + return newPaddedSize != currPaddedSize || + ((isInputLowBroadcasted || isOutputHighBroadcasted) && axisSize != currentAxisSize); } return false; } @@ -1401,26 +1524,33 @@ 
void FakeQuantize::prepareParams() { OPENVINO_ASSERT(newPaddedSize != 0); if (internalBlobMemory.empty() || newPaddedSize != rnd_up(currentAxisSize, 16) || - ((isInputLowBroadcasted || isOutputHighBroadcasted) && axisSize != currentAxisSize)) { - DnnlBlockedMemoryDesc weightsDataDesc(Shape(VectorDims{newPaddedSize}), memory::data_type::f32, memory::format_tag::x); + ((isInputLowBroadcasted || isOutputHighBroadcasted) && axisSize != currentAxisSize)) { + DnnlBlockedMemoryDesc weightsDataDesc(Shape(VectorDims{newPaddedSize}), + memory::data_type::f32, + memory::format_tag::x); constexpr size_t numBinFqIntBlob = 2; bool needUpdThr = false, needUpdMask = false; if (isInputLowBroadcasted && axisSize != currentAxisSize) { binarizationThresholds.resize(newPaddedSize); - std::fill(binarizationThresholds.begin() + 1, binarizationThresholds.begin() + axisSize, binarizationThresholds[0]); + std::fill(binarizationThresholds.begin() + 1, + binarizationThresholds.begin() + axisSize, + binarizationThresholds[0]); std::fill(binarizationThresholds.begin() + axisSize, binarizationThresholds.end(), 0.f); needUpdThr = true; } if (isOutputHighBroadcasted && axisSize != currentAxisSize) { binarizationOutputMask.resize(newPaddedSize); - std::fill(binarizationOutputMask.begin() + 1, binarizationOutputMask.begin() + axisSize, binarizationOutputMask[0]); + std::fill(binarizationOutputMask.begin() + 1, + binarizationOutputMask.begin() + axisSize, + binarizationOutputMask[0]); std::fill(binarizationOutputMask.begin() + axisSize, binarizationOutputMask.end(), 0); needUpdMask = true; } if (internalBlobMemory.empty() || needUpdThr) { - auto binarizationThresholdsDataMem = std::make_shared(getEngine(), weightsDataDesc, getBinarizationTresholdsPtr()); + auto binarizationThresholdsDataMem = + std::make_shared(getEngine(), weightsDataDesc, getBinarizationTresholdsPtr()); if (internalBlobMemory.empty()) { internalBlobMemory.push_back(binarizationThresholdsDataMem); } else { @@ -1429,7 +1559,8 @@ void 
FakeQuantize::prepareParams() { } if (internalBlobMemory.size() == (numBinFqIntBlob - 1) || needUpdMask) { - auto binarizationMaskDataMem = std::make_shared(getEngine(), weightsDataDesc, getBinarizationOutputMaskPtr()); + auto binarizationMaskDataMem = + std::make_shared(getEngine(), weightsDataDesc, getBinarizationOutputMaskPtr()); if (internalBlobMemory.size() == (numBinFqIntBlob - 1)) { internalBlobMemory.push_back(binarizationMaskDataMem); } else { @@ -1449,31 +1580,39 @@ void FakeQuantize::createPrimitive() { if (selectedPrimitiveDescriptor->getImplementationType() != impl_desc_type::ref) { const auto& config = getSelectedPrimitiveDescriptor()->getConfig(); - //Form FakeQuanKey + // Form FakeQuanKey FakeQuantKey key = {}; key.jqp.src_prc = config.inConfs[0].getMemDesc()->getPrecision(); key.jqp.wei_prc = ov::element::f32; key.jqp.dst_prc = config.outConfs[0].getMemDesc()->getPrecision(); - const auto &srcMemory = getParentEdgeAt(0)->getMemory(); - const auto &srcDesc = srcMemory.getDesc(); + const auto& srcMemory = getParentEdgeAt(0)->getMemory(); + const auto& srcDesc = srcMemory.getDesc(); key.jqp.is_planar = srcDesc.hasLayoutType(LayoutType::ncsp) && one_of(srcDesc.getShape().getRank(), 3u, 4u, 5u); key.jqp.op_type = getAlgorithm(); if (isBinarization()) { - const auto &inDims = srcMemory.getStaticDims(); + const auto& inDims = srcMemory.getStaticDims(); key.jqp.c = inDims.size() > 1 ? inDims[1] : 1; } else { - // in case of blocked layout we need to extend vectors to prevent read from unallocated memory - size_t paddedSize = srcDesc.hasLayoutType(LayoutType::nCsp16c) ? 16 : srcDesc.hasLayoutType(LayoutType::nCsp8c) ? 8 : 1; + // in case of blocked layout we need to extend vectors to prevent read from unallocated memory + size_t paddedSize = srcDesc.hasLayoutType(LayoutType::nCsp16c) ? 16 + : srcDesc.hasLayoutType(LayoutType::nCsp8c) ? 
8 + : 1; if (paddedSize != 1) { - if (!broadcasted[static_cast(FQ_add_input_type::CROP_LOW)]) cropLow.resize(rnd_up(cropLow.size(), paddedSize)); - if (!broadcasted[static_cast(FQ_add_input_type::CROP_HIGH)]) cropHigh.resize(rnd_up(cropHigh.size(), paddedSize)); - if (!broadcasted[static_cast(FQ_add_input_type::INPUT_SCALE)]) inputScale.resize(rnd_up(inputScale.size(), paddedSize)); - if (!broadcasted[static_cast(FQ_add_input_type::INPUT_SHIFT)]) inputShift.resize(rnd_up(inputShift.size(), paddedSize)); - if (!broadcasted[static_cast(FQ_add_input_type::OUTPUT_SCALE)]) outputScale.resize(rnd_up(outputScale.size(), paddedSize)); - if (!broadcasted[static_cast(FQ_add_input_type::OUTPUT_SHIFT)]) outputShift.resize(rnd_up(outputShift.size(), paddedSize)); + if (!broadcasted[static_cast(FQ_add_input_type::CROP_LOW)]) + cropLow.resize(rnd_up(cropLow.size(), paddedSize)); + if (!broadcasted[static_cast(FQ_add_input_type::CROP_HIGH)]) + cropHigh.resize(rnd_up(cropHigh.size(), paddedSize)); + if (!broadcasted[static_cast(FQ_add_input_type::INPUT_SCALE)]) + inputScale.resize(rnd_up(inputScale.size(), paddedSize)); + if (!broadcasted[static_cast(FQ_add_input_type::INPUT_SHIFT)]) + inputShift.resize(rnd_up(inputShift.size(), paddedSize)); + if (!broadcasted[static_cast(FQ_add_input_type::OUTPUT_SCALE)]) + outputScale.resize(rnd_up(outputScale.size(), paddedSize)); + if (!broadcasted[static_cast(FQ_add_input_type::OUTPUT_SHIFT)]) + outputShift.resize(rnd_up(outputShift.size(), paddedSize)); } key.jqp.broadcasted = broadcasted; @@ -1530,11 +1669,10 @@ void FakeQuantize::executeReference() { parallel_nd(N, CB, D, H, W, [&](dim_t n, dim_t cb, dim_t d, dim_t h, dim_t w) { uint8_t bin_val = 0x00; for (int c = cb * nbits, shift = 0; c < std::min(static_cast(C), (cb + 1) * nbits); c++, shift++) { - size_t src_off = srcDims.size() == 4 ? - n * s_str[0] + c * s_str[1] + h * s_str[2] + w * s_str[3] : - srcDims.size() == 5 ? 
- n * s_str[0] + c * s_str[1] + d * s_str[2] + h * s_str[3] + w * s_str[4] : - n * s_str[0] + c * s_str[1]; + size_t src_off = srcDims.size() == 4 ? n * s_str[0] + c * s_str[1] + h * s_str[2] + w * s_str[3] + : srcDims.size() == 5 + ? n * s_str[0] + c * s_str[1] + d * s_str[2] + h * s_str[3] + w * s_str[4] + : n * s_str[0] + c * s_str[1]; float val = src[src_off]; float thr = thresholds[c]; @@ -1546,11 +1684,10 @@ void FakeQuantize::executeReference() { bin_val |= (bit << shift); } - size_t dst_off = dstDims.size() == 4 ? - n * d_str[0] + (cb * nbits) * d_str[1] + h * d_str[2] + w * d_str[3] : - dstDims.size() == 5 ? - n * d_str[0] + (cb * nbits) * d_str[1] + d * d_str[2] + h * d_str[3] + w * d_str[4] : - n * d_str[0] + (cb * nbits) * d_str[1]; + size_t dst_off = dstDims.size() == 4 ? n * d_str[0] + (cb * nbits) * d_str[1] + h * d_str[2] + w * d_str[3] + : dstDims.size() == 5 + ? n * d_str[0] + (cb * nbits) * d_str[1] + d * d_str[2] + h * d_str[3] + w * d_str[4] + : n * d_str[0] + (cb * nbits) * d_str[1]; dst[dst_off / nbits] = bin_val; }); @@ -1558,46 +1695,44 @@ void FakeQuantize::executeReference() { auto dst = dstMemory->getDataAs(); parallel_nd(N, C, D, H, W, [&](dim_t n, dim_t c, dim_t d, dim_t h, dim_t w) { - size_t src_off = srcDims.size() == 5 ? - n * s_str[0] + c * s_str[1] + d * s_str[2] + h * s_str[3] + w * s_str[4] : - srcDims.size() == 4 ? - n * s_str[0] + c * s_str[1] + h * s_str[2] + w * s_str[3] : - srcDims.size() == 3 ? - n * s_str[0] + c * s_str[1] + h * s_str[2] : - srcDims.size() == 2 ? - n * s_str[0] + c * s_str[1] : - n * s_str[0]; + size_t src_off = srcDims.size() == 5 + ? n * s_str[0] + c * s_str[1] + d * s_str[2] + h * s_str[3] + w * s_str[4] + : srcDims.size() == 4 ? n * s_str[0] + c * s_str[1] + h * s_str[2] + w * s_str[3] + : srcDims.size() == 3 ? n * s_str[0] + c * s_str[1] + h * s_str[2] + : srcDims.size() == 2 ? n * s_str[0] + c * s_str[1] + : n * s_str[0]; float src_val = src[src_off]; int wei_idx = getAxis() == 0 ? 
n : c; float cl = broadcasted[static_cast(FQ_add_input_type::CROP_LOW)] ? cropLow[0] : cropLow[wei_idx]; float ch = broadcasted[static_cast(FQ_add_input_type::CROP_HIGH)] ? cropHigh[0] : cropHigh[wei_idx]; - float isc = broadcasted[static_cast(FQ_add_input_type::INPUT_SCALE)] ? inputScale[0] : inputScale[wei_idx]; - float ish = broadcasted[static_cast(FQ_add_input_type::INPUT_SHIFT)] ? inputShift[0] : inputShift[wei_idx]; - float osc = broadcasted[static_cast(FQ_add_input_type::OUTPUT_SCALE)] ? outputScale[0] : outputScale[wei_idx]; - float osh = broadcasted[static_cast(FQ_add_input_type::OUTPUT_SHIFT)] ? outputShift[0] : outputShift[wei_idx]; + float isc = + broadcasted[static_cast(FQ_add_input_type::INPUT_SCALE)] ? inputScale[0] : inputScale[wei_idx]; + float ish = + broadcasted[static_cast(FQ_add_input_type::INPUT_SHIFT)] ? inputShift[0] : inputShift[wei_idx]; + float osc = broadcasted[static_cast(FQ_add_input_type::OUTPUT_SCALE)] ? outputScale[0] + : outputScale[wei_idx]; + float osh = broadcasted[static_cast(FQ_add_input_type::OUTPUT_SHIFT)] ? outputShift[0] + : outputShift[wei_idx]; float dst_val = nstl::min(ch, nstl::max(cl, src_val)); dst_val = dst_val * isc + ish; dst_val = roundf(dst_val); dst_val = dst_val * osc + osh; - size_t dst_off = dstDims.size() == 5 ? - n * d_str[0] + c * d_str[1] + d * d_str[2] + h * d_str[3] + w * d_str[4] : - dstDims.size() == 4 ? - n * d_str[0] + c * d_str[1] + h * d_str[2] + w * d_str[3] : - dstDims.size() == 3 ? - n * d_str[0] + c * d_str[1] + h * d_str[2] : - dstDims.size() == 2 ? - n * d_str[0] + c * d_str[1] : - n * d_str[0]; + size_t dst_off = dstDims.size() == 5 + ? n * d_str[0] + c * d_str[1] + d * d_str[2] + h * d_str[3] + w * d_str[4] + : dstDims.size() == 4 ? n * d_str[0] + c * d_str[1] + h * d_str[2] + w * d_str[3] + : dstDims.size() == 3 ? n * d_str[0] + c * d_str[1] + h * d_str[2] + : dstDims.size() == 2 ? 
n * d_str[0] + c * d_str[1] + : n * d_str[0]; dst[dst_off] = dst_val; }); } } -void FakeQuantize::executeBinarization(const std::unique_ptr &pKernel) const { +void FakeQuantize::executeBinarization(const std::unique_ptr& pKernel) const { #if defined(OPENVINO_ARCH_X86_64) auto srcMemory = getSrcMemoryAtPort(0); auto dstMemory = getDstMemoryAtPort(0); @@ -1628,8 +1763,8 @@ void FakeQuantize::executeBinarization(const std::unique_ptr &pKernel) const { +void FakeQuantize::executeQuantization(const std::unique_ptr& pKernel) const { #if defined(OPENVINO_ARCH_X86_64) auto srcMemory = getSrcMemoryAtPort(0); auto dstMemory = getDstMemoryAtPort(0); @@ -1651,10 +1786,11 @@ void FakeQuantize::executeQuantization(const std::unique_ptrjqp_; + const auto& jqp = pKernel->jqp_; auto src_type_size = jqp.src_prc.size(); auto dst_type_size = jqp.dst_prc.size(); @@ -1691,15 +1827,20 @@ void FakeQuantize::executeQuantization(const std::unique_ptr(FQ_add_input_type::CROP_LOW)] ? &cropLow[0] : &cropLow[c]; - arg.crop_high = broadcasted[static_cast(FQ_add_input_type::CROP_HIGH)] ? &cropHigh[0] : &cropHigh[c]; - arg.input_scale = broadcasted[static_cast(FQ_add_input_type::INPUT_SCALE)] ? &inputScale[0] : &inputScale[c]; - arg.input_shift = broadcasted[static_cast(FQ_add_input_type::INPUT_SHIFT)] ? &inputShift[0] : &inputShift[c]; - arg.output_scale = broadcasted[static_cast(FQ_add_input_type::OUTPUT_SCALE)] ? &outputScale[0] : &outputScale[c]; - arg.output_shift = broadcasted[static_cast(FQ_add_input_type::OUTPUT_SHIFT)] ? &outputShift[0] : &outputShift[c]; - - arg.src_step = (size_t) blk_size * src_type_size; - arg.dst_step = (size_t) blk_size * dst_type_size; - arg.block_size = (size_t) blk_size; + arg.crop_high = + broadcasted[static_cast(FQ_add_input_type::CROP_HIGH)] ? &cropHigh[0] : &cropHigh[c]; + arg.input_scale = + broadcasted[static_cast(FQ_add_input_type::INPUT_SCALE)] ? 
&inputScale[0] : &inputScale[c]; + arg.input_shift = + broadcasted[static_cast(FQ_add_input_type::INPUT_SHIFT)] ? &inputShift[0] : &inputShift[c]; + arg.output_scale = + broadcasted[static_cast(FQ_add_input_type::OUTPUT_SCALE)] ? &outputScale[0] : &outputScale[c]; + arg.output_shift = + broadcasted[static_cast(FQ_add_input_type::OUTPUT_SHIFT)] ? &outputShift[0] : &outputShift[c]; + + arg.src_step = (size_t)blk_size * src_type_size; + arg.dst_step = (size_t)blk_size * dst_type_size; + arg.block_size = (size_t)blk_size; arg.work_amount = (size_t)H; (*pKernel)(&arg); @@ -1714,22 +1855,27 @@ void FakeQuantize::executeQuantization(const std::unique_ptr(FQ_add_input_type::CROP_LOW)] ? &cropLow[0] : &cropLow[c]; - arg.crop_high = broadcasted[static_cast(FQ_add_input_type::CROP_HIGH)] ? &cropHigh[0] : &cropHigh[c]; - arg.input_scale = broadcasted[static_cast(FQ_add_input_type::INPUT_SCALE)] ? &inputScale[0] : &inputScale[c]; - arg.input_shift = broadcasted[static_cast(FQ_add_input_type::INPUT_SHIFT)] ? &inputShift[0] : &inputShift[c]; - arg.output_scale = broadcasted[static_cast(FQ_add_input_type::OUTPUT_SCALE)] ? &outputScale[0] : &outputScale[c]; - arg.output_shift = broadcasted[static_cast(FQ_add_input_type::OUTPUT_SHIFT)] ? &outputShift[0] : &outputShift[c]; - - arg.src_step = is_blk_format ? (size_t) blk_size * src_type_size : (size_t) C * src_type_size; - arg.dst_step = is_blk_format ? (size_t) blk_size * dst_type_size : (size_t) C * dst_type_size; - arg.block_size = is_blk_format ? (size_t) blk_size : nstl::min(blk_size, C - c); + arg.crop_high = + broadcasted[static_cast(FQ_add_input_type::CROP_HIGH)] ? &cropHigh[0] : &cropHigh[c]; + arg.input_scale = + broadcasted[static_cast(FQ_add_input_type::INPUT_SCALE)] ? &inputScale[0] : &inputScale[c]; + arg.input_shift = + broadcasted[static_cast(FQ_add_input_type::INPUT_SHIFT)] ? &inputShift[0] : &inputShift[c]; + arg.output_scale = + broadcasted[static_cast(FQ_add_input_type::OUTPUT_SCALE)] ? 
&outputScale[0] : &outputScale[c]; + arg.output_shift = + broadcasted[static_cast(FQ_add_input_type::OUTPUT_SHIFT)] ? &outputShift[0] : &outputShift[c]; + + arg.src_step = is_blk_format ? (size_t)blk_size * src_type_size : (size_t)C * src_type_size; + arg.dst_step = is_blk_format ? (size_t)blk_size * dst_type_size : (size_t)C * dst_type_size; + arg.block_size = is_blk_format ? (size_t)blk_size : nstl::min(blk_size, C - c); arg.work_amount = (size_t)std::min(static_cast(batch_size), H * W - b * batch_size); (*pKernel)(&arg); @@ -1740,25 +1886,29 @@ void FakeQuantize::executeQuantization(const std::unique_ptr(FQ_add_input_type::CROP_LOW)] ? &cropLow[0] : &cropLow[c]; - arg.crop_high = broadcasted[static_cast(FQ_add_input_type::CROP_HIGH)] ? &cropHigh[0] : &cropHigh[c]; - arg.input_scale = broadcasted[static_cast(FQ_add_input_type::INPUT_SCALE)] ? &inputScale[0] : &inputScale[c]; - arg.input_shift = broadcasted[static_cast(FQ_add_input_type::INPUT_SHIFT)] ? &inputShift[0] : &inputShift[c]; - arg.output_scale = broadcasted[static_cast(FQ_add_input_type::OUTPUT_SCALE)] ? &outputScale[0] : &outputScale[c]; - arg.output_shift = broadcasted[static_cast(FQ_add_input_type::OUTPUT_SHIFT)] ? &outputShift[0] : &outputShift[c]; - - arg.src_step = is_blk_format ? (size_t) blk_size * src_type_size : (size_t) C * src_type_size; - arg.dst_step = is_blk_format ? (size_t) blk_size * dst_type_size : (size_t) C * dst_type_size; - arg.block_size = (is_blk_format && srcDims.size() != 2) ? (size_t) blk_size : nstl::min(blk_size, C - c); - arg.work_amount = (size_t) W; + arg.crop_high = + broadcasted[static_cast(FQ_add_input_type::CROP_HIGH)] ? &cropHigh[0] : &cropHigh[c]; + arg.input_scale = + broadcasted[static_cast(FQ_add_input_type::INPUT_SCALE)] ? &inputScale[0] : &inputScale[c]; + arg.input_shift = + broadcasted[static_cast(FQ_add_input_type::INPUT_SHIFT)] ? &inputShift[0] : &inputShift[c]; + arg.output_scale = + broadcasted[static_cast(FQ_add_input_type::OUTPUT_SCALE)] ? 
&outputScale[0] : &outputScale[c]; + arg.output_shift = + broadcasted[static_cast(FQ_add_input_type::OUTPUT_SHIFT)] ? &outputShift[0] : &outputShift[c]; + + arg.src_step = is_blk_format ? (size_t)blk_size * src_type_size : (size_t)C * src_type_size; + arg.dst_step = is_blk_format ? (size_t)blk_size * dst_type_size : (size_t)C * dst_type_size; + arg.block_size = (is_blk_format && srcDims.size() != 2) ? (size_t)blk_size : nstl::min(blk_size, C - c); + arg.work_amount = (size_t)W; (*pKernel)(&arg); }); @@ -1778,7 +1928,7 @@ void FakeQuantize::execute(dnnl::stream strm) { } } -void FakeQuantize::initializePostOpData(const VectorDims &dims, const size_t bufferAlignment, bool doRounding) { +void FakeQuantize::initializePostOpData(const VectorDims& dims, const size_t bufferAlignment, bool doRounding) { if (postOpDataVersion == parameterVersion) return; @@ -1789,11 +1939,15 @@ void FakeQuantize::initializePostOpData(const VectorDims &dims, const size_t buf binarizationOutputMask.resize(axisPaddedSize, 0); if (isInputLowBroadcasted) { - std::fill(binarizationThresholds.begin() + 1, binarizationThresholds.begin() + realAxisSize, binarizationThresholds[0]); + std::fill(binarizationThresholds.begin() + 1, + binarizationThresholds.begin() + realAxisSize, + binarizationThresholds[0]); std::fill(binarizationThresholds.begin() + realAxisSize, binarizationThresholds.end(), 0.f); } if (isOutputHighBroadcasted) { - std::fill(binarizationOutputMask.begin() + 1, binarizationOutputMask.begin() + realAxisSize, binarizationOutputMask[0]); + std::fill(binarizationOutputMask.begin() + 1, + binarizationOutputMask.begin() + realAxisSize, + binarizationOutputMask[0]); std::fill(binarizationThresholds.begin() + realAxisSize, binarizationThresholds.end(), 0.f); } } else { @@ -1803,7 +1957,7 @@ void FakeQuantize::initializePostOpData(const VectorDims &dims, const size_t buf postOpDataVersion = parameterVersion; } -void FakeQuantize::initializePostOpDataLegacy(const VectorDims &dims, const size_t 
bufferAlignment) { +void FakeQuantize::initializePostOpDataLegacy(const VectorDims& dims, const size_t bufferAlignment) { if (legacyPostOpDataVersion == parameterVersion) return; @@ -1815,11 +1969,15 @@ void FakeQuantize::initializePostOpDataLegacy(const VectorDims &dims, const size binarizationOutputMask.resize(axisPaddedSize, 0); if (isInputLowBroadcasted) { - std::fill(binarizationThresholds.begin() + 1, binarizationThresholds.begin() + realAxisSize, binarizationThresholds[0]); + std::fill(binarizationThresholds.begin() + 1, + binarizationThresholds.begin() + realAxisSize, + binarizationThresholds[0]); std::fill(binarizationThresholds.begin() + realAxisSize, binarizationThresholds.end(), 0.f); } if (isOutputHighBroadcasted) { - std::fill(binarizationOutputMask.begin() + 1, binarizationOutputMask.begin() + realAxisSize, binarizationOutputMask[0]); + std::fill(binarizationOutputMask.begin() + 1, + binarizationOutputMask.begin() + realAxisSize, + binarizationOutputMask[0]); std::fill(binarizationThresholds.begin() + realAxisSize, binarizationThresholds.end(), 0.f); } @@ -1839,7 +1997,10 @@ void FakeQuantize::initializePostOpDataLegacy(const VectorDims &dims, const size legacyPostOpDataVersion = parameterVersion; } -void FakeQuantize::appendMemory(const size_t dataSize, const void *data, MemoryPtr &memPtr, std::vector& postOpsMem) { +void FakeQuantize::appendMemory(const size_t dataSize, + const void* data, + MemoryPtr& memPtr, + std::vector& postOpsMem) { if (!memPtr) { DnnlBlockedMemoryDesc memoryDesc(ov::element::f32, {dataSize}); memPtr = std::make_shared(getEngine(), memoryDesc, data); @@ -1848,12 +2009,15 @@ void FakeQuantize::appendMemory(const size_t dataSize, const void *data, MemoryP } } -void FakeQuantize::appendMemory(const size_t dataSize, const void *data, MemoryPtr &memPtr, std::vector& postOpsMem) { +void FakeQuantize::appendMemory(const size_t dataSize, + const void* data, + MemoryPtr& memPtr, + std::vector& postOpsMem) { postOpsMem.push_back(data); 
} template -void FakeQuantize::appendPostOpsImpl(dnnl::post_ops& ops, const VectorDims &postOpDims, std::vector& postOpsMem) { +void FakeQuantize::appendPostOpsImpl(dnnl::post_ops& ops, const VectorDims& postOpDims, std::vector& postOpsMem) { // try to map fakeQuantizeNode using output scale & eltwise first // if failed, fallback to append_quantization() @@ -1865,21 +2029,40 @@ void FakeQuantize::appendPostOpsImpl(dnnl::post_ops& ops, const VectorDims &post initializePostOpDataLegacy(postOpDims, bufferAlignment); if (getAlgorithm() == Algorithm::FQBinarization) { - ops.append_binarization(dnnl::algorithm::binarization_depthwise, (const float*)&binarizationThresholds[0], (const float*)&binarizationOutputMask[0]); + ops.append_binarization(dnnl::algorithm::binarization_depthwise, + (const float*)&binarizationThresholds[0], + (const float*)&binarizationOutputMask[0]); } else { - dnnl::algorithm alg = getAlgorithm() == Algorithm::FQQuantization ? dnnl::algorithm::quantization_quantize : - dnnl::algorithm::quantization_quantize_dequantize; + dnnl::algorithm alg = getAlgorithm() == Algorithm::FQQuantization + ? 
dnnl::algorithm::quantization_quantize + : dnnl::algorithm::quantization_quantize_dequantize; - std::array per_channel = {cropLowSize > 1, cropHighSize > 1, inputScaleSize > 1, - inputShiftSize > 1, outputScaleSize > 1, outputShiftSize > 1}; + std::array per_channel = {cropLowSize > 1, + cropHighSize > 1, + inputScaleSize > 1, + inputShiftSize > 1, + outputScaleSize > 1, + outputShiftSize > 1}; std::array all_default = {false}; - all_default[0] = std::all_of(cropLow.cbegin(), cropLow.cend(), [](float val){ return val == 0.f; }); - all_default[1] = std::all_of(cropHigh.cbegin(), cropHigh.cend(), [](float val){ return val == 0.f; }); - all_default[2] = std::all_of(inputScale.cbegin(), inputScale.cend(), [](float val){ return val == 1.f; }); - all_default[3] = std::all_of(inputShift.cbegin(), inputShift.cend(), [](float val){ return val == 0.f; }); - all_default[4] = std::all_of(outputScale.cbegin(), outputScale.cend(), [](float val){ return val == 1.f; }); - all_default[5] = std::all_of(outputShift.cbegin(), outputShift.cend(), [](float val){ return val == 0.f; }); + all_default[0] = std::all_of(cropLow.cbegin(), cropLow.cend(), [](float val) { + return val == 0.f; + }); + all_default[1] = std::all_of(cropHigh.cbegin(), cropHigh.cend(), [](float val) { + return val == 0.f; + }); + all_default[2] = std::all_of(inputScale.cbegin(), inputScale.cend(), [](float val) { + return val == 1.f; + }); + all_default[3] = std::all_of(inputShift.cbegin(), inputShift.cend(), [](float val) { + return val == 0.f; + }); + all_default[4] = std::all_of(outputScale.cbegin(), outputScale.cend(), [](float val) { + return val == 1.f; + }); + all_default[5] = std::all_of(outputShift.cbegin(), outputShift.cend(), [](float val) { + return val == 0.f; + }); std::array offsets = {0}; offsets[1] = offsets[0] + cropLowSize; @@ -1894,7 +2077,9 @@ void FakeQuantize::appendPostOpsImpl(dnnl::post_ops& ops, const VectorDims &post } } -void FakeQuantize::appendPostOps(dnnl::post_ops& ops, const 
VectorDims &postOpDims, std::unordered_map& postOpsMem, +void FakeQuantize::appendPostOps(dnnl::post_ops& ops, + const VectorDims& postOpDims, + std::unordered_map& postOpsMem, const int channelAxis) { std::vector postOpsMemPtrs; appendPostOpsImpl(ops, postOpDims, postOpsMemPtrs); @@ -1906,7 +2091,9 @@ void FakeQuantize::appendPostOps(dnnl::post_ops& ops, const VectorDims &postOpDi } } -void FakeQuantize::appendPostOps(dnnl::post_ops& ops, const VectorDims &postOpDims, std::vector& postOpsMem, +void FakeQuantize::appendPostOps(dnnl::post_ops& ops, + const VectorDims& postOpDims, + std::vector& postOpsMem, const int channelAxis) { appendPostOpsImpl(ops, postOpDims, postOpsMem); } @@ -1957,7 +2144,7 @@ void FakeQuantize::updateOptimizedFormula(bool do_rounding) { // per-channel FQ. if (isPerTensor(inputShift, inputShift[0], 0.00005f)) { f.ish.resize(OC); - for (auto & v : f.ish) + for (auto& v : f.ish) v = inputShift[0]; } else { f.ish = inputShift; @@ -2115,7 +2302,7 @@ bool FakeQuantize::appendAttrPostOps(DnnlPostOpsComposerLegacy& dnnlpoc, return true; } -FakeQuantize::FakeQuantizeJitExecutor::FakeQuantizeJitExecutor(const jit_quantize_params &_jqp) { +FakeQuantize::FakeQuantizeJitExecutor::FakeQuantizeJitExecutor(const jit_quantize_params& _jqp) { #if defined(OPENVINO_ARCH_X86_64) bool isBinarization = _jqp.op_type == Algorithm::FQBinarization; if (mayiuse(cpu::x64::avx512_core)) { @@ -2157,6 +2344,6 @@ bool FakeQuantize::created() const { return getType() == Type::FakeQuantize; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/fake_quantize.h b/src/plugins/intel_cpu/src/nodes/fake_quantize.h index 62aea6092451a6..af34c0b91a1a7a 100644 --- a/src/plugins/intel_cpu/src/nodes/fake_quantize.h +++ b/src/plugins/intel_cpu/src/nodes/fake_quantize.h @@ -4,25 +4,17 @@ #pragma once -#include "common/primitive_attr.hpp" -#include "node.h" - 
#include + +#include "common/primitive_attr.hpp" #include "dnnl_postops_composer_legacy.h" +#include "node.h" namespace ov { namespace intel_cpu { namespace node { -enum class FQ_add_input_type { - CROP_LOW, - CROP_HIGH, - INPUT_SCALE, - INPUT_SHIFT, - OUTPUT_SCALE, - OUTPUT_SHIFT, - INPUTS_SIZE -}; +enum class FQ_add_input_type { CROP_LOW, CROP_HIGH, INPUT_SCALE, INPUT_SHIFT, OUTPUT_SCALE, OUTPUT_SHIFT, INPUTS_SIZE }; struct jit_quantize_params { bool is_planar; @@ -33,8 +25,8 @@ struct jit_quantize_params { Algorithm op_type; - int c; // need only for binarization - std::bitset(FQ_add_input_type::INPUTS_SIZE)> broadcasted; // need only for quantization + int c; // need only for binarization + std::bitset(FQ_add_input_type::INPUTS_SIZE)> broadcasted; // need only for quantization }; struct jit_quantize_call_args { @@ -57,9 +49,9 @@ struct jit_quantize_call_args { }; struct jit_uni_quantize_kernel { - void (*ker_)(const jit_quantize_call_args *); + void (*ker_)(const jit_quantize_call_args*); - void operator()(const jit_quantize_call_args *args) { + void operator()(const jit_quantize_call_args* args) { assert(ker_); ker_(args); } @@ -82,58 +74,116 @@ class FakeQuantize : public Node { void execute(dnnl::stream strm) override; void executeDynamicImpl(dnnl::stream strm) override; - size_t getAxis() const { return axis; } + size_t getAxis() const { + return axis; + } - bool isBinarization() const { return getAlgorithm() == Algorithm::FQBinarization; } + bool isBinarization() const { + return getAlgorithm() == Algorithm::FQBinarization; + } bool needPrepareParams() const override; void prepareParams() override; void createPrimitive() override; - const float* getBinarizationTresholdsPtr() const { return &binarizationThresholds[0]; } - const float* getBinarizationOutputMaskPtr() const { return reinterpret_cast(&binarizationOutputMask[0]); } - size_t getBinarizationTresholdsSize() const { return binarizationThresholds.size(); } - size_t getBinarizationOutputMaskSize() 
const { return binarizationOutputMask.size(); } + const float* getBinarizationTresholdsPtr() const { + return &binarizationThresholds[0]; + } + const float* getBinarizationOutputMaskPtr() const { + return reinterpret_cast(&binarizationOutputMask[0]); + } + size_t getBinarizationTresholdsSize() const { + return binarizationThresholds.size(); + } + size_t getBinarizationOutputMaskSize() const { + return binarizationOutputMask.size(); + } - const std::vector& getCropLow() const { return cropLow; } - const std::vector& getCropHigh() const { return cropHigh; } - const std::vector& getInputScale() const { return inputScale; } - const std::vector& getInputShift() const { return inputShift; } - const std::vector& getOutputScale() const { return outputScale; } - const std::vector& getOutputShift() const { return outputShift; } - const size_t getLevels() const { return levels; } + const std::vector& getCropLow() const { + return cropLow; + } + const std::vector& getCropHigh() const { + return cropHigh; + } + const std::vector& getInputScale() const { + return inputScale; + } + const std::vector& getInputShift() const { + return inputShift; + } + const std::vector& getOutputScale() const { + return outputScale; + } + const std::vector& getOutputShift() const { + return outputShift; + } + const size_t getLevels() const { + return levels; + } void setCropLow(std::vector newCropLow) { - cropLow = std::move(newCropLow); cropLowSize = cropLow.size(); ++parameterVersion; + cropLow = std::move(newCropLow); + cropLowSize = cropLow.size(); + ++parameterVersion; } void setCropHigh(std::vector newCropHigh) { - cropHigh = std::move(newCropHigh); cropHighSize = cropHigh.size(); ++parameterVersion; + cropHigh = std::move(newCropHigh); + cropHighSize = cropHigh.size(); + ++parameterVersion; } void setInputScale(std::vector newInputScale) { - inputScale = std::move(newInputScale); inputScaleSize = inputScale.size(); ++parameterVersion; + inputScale = std::move(newInputScale); + 
inputScaleSize = inputScale.size(); + ++parameterVersion; } void setInputShift(std::vector newInputShift) { - inputShift = std::move(newInputShift); inputShiftSize = inputShift.size(); ++parameterVersion; + inputShift = std::move(newInputShift); + inputShiftSize = inputShift.size(); + ++parameterVersion; } void setOutputScale(std::vector newOutputScale) { - outputScale = std::move(newOutputScale); outputScaleSize = outputScale.size(); ++parameterVersion; + outputScale = std::move(newOutputScale); + outputScaleSize = outputScale.size(); + ++parameterVersion; } void setOutputShift(std::vector newOutputShift) { - outputShift = std::move(newOutputShift); outputShiftSize = outputShift.size(); ++parameterVersion; + outputShift = std::move(newOutputShift); + outputShiftSize = outputShift.size(); + ++parameterVersion; } - const std::vector& getFQScales() const { return fqScales; } + const std::vector& getFQScales() const { + return fqScales; + } - bool isInputLowBroadcast() const { return isInputLowBroadcasted; } - bool isInputHighBroadcast() const { return isInputHighBroadcasted; } - bool isOutputLowBroadcast() const { return isOutputLowBroadcasted; } - bool isOutputHighBroadcast() const { return isOutputHighBroadcasted; } + bool isInputLowBroadcast() const { + return isInputLowBroadcasted; + } + bool isInputHighBroadcast() const { + return isInputHighBroadcasted; + } + bool isOutputLowBroadcast() const { + return isOutputLowBroadcasted; + } + bool isOutputHighBroadcast() const { + return isOutputHighBroadcasted; + } - ov::element::Type getInputPrecision() const { return inputPrecision; } - ov::element::Type getOutputPrecision() const { return outputPrecision; } + ov::element::Type getInputPrecision() const { + return inputPrecision; + } + ov::element::Type getOutputPrecision() const { + return outputPrecision; + } - void appendPostOps(dnnl::post_ops& ops, const VectorDims &postOpDims, std::unordered_map& postOpsMem, const int channelAxis = 1) override; - void 
appendPostOps(dnnl::post_ops& ops, const VectorDims &postOpDims, std::vector& postOpsMem, const int channelAxis = 1) override; + void appendPostOps(dnnl::post_ops& ops, + const VectorDims& postOpDims, + std::unordered_map& postOpsMem, + const int channelAxis = 1) override; + void appendPostOps(dnnl::post_ops& ops, + const VectorDims& postOpDims, + std::vector& postOpsMem, + const int channelAxis = 1) override; bool appendAttrPostOps(DnnlPostOpsComposerLegacy& dnnlpoc, bool isLastPostOp, dnnl::memory::data_type outDataType, @@ -143,12 +193,14 @@ class FakeQuantize : public Node { static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; enum BroadcastingPolicy { - PerChannel, // all FQ operations are per channel - PerTensor, // all FQ operations are per tensor - Mixed, // some per channel, some per tensor + PerChannel, // all FQ operations are per channel + PerTensor, // all FQ operations are per tensor + Mixed, // some per channel, some per tensor }; - BroadcastingPolicy getBroadcastingPolicy() const { return broadcastingPolicy; } + BroadcastingPolicy getBroadcastingPolicy() const { + return broadcastingPolicy; + } MemoryPtr cropLowMemory; MemoryPtr cropHighMemory; @@ -165,22 +217,22 @@ class FakeQuantize : public Node { using executorPtr = std::shared_ptr; executorPtr execPtr = nullptr; struct FakeQuantizeJitExecutor : public FakeQuantizeExecutor { - FakeQuantizeJitExecutor(const jit_quantize_params &_jqp); + FakeQuantizeJitExecutor(const jit_quantize_params& _jqp); void exec(const FakeQuantize& node) override; std::unique_ptr pKernel; }; void init() override; std::vector getDataFormats() const; - void initializePostOpData(const VectorDims &postOpDims, const size_t bufferAlignment, bool doRounding); - void initializePostOpDataLegacy(const VectorDims &dims, const size_t bufferAlignment); + void initializePostOpData(const VectorDims& postOpDims, const size_t bufferAlignment, bool doRounding); + void 
initializePostOpDataLegacy(const VectorDims& dims, const size_t bufferAlignment); void executeReference(); - void executeBinarization(const std::unique_ptr &pKernel) const; - void executeQuantization(const std::unique_ptr &pKernel) const; + void executeBinarization(const std::unique_ptr& pKernel) const; + void executeQuantization(const std::unique_ptr& pKernel) const; - void appendMemory(const size_t dataSize, const void *data, MemoryPtr &memPtr, std::vector& postOpsMem); - void appendMemory(const size_t dataSize, const void *data, MemoryPtr &memPtr, std::vector& postOpsMem); + void appendMemory(const size_t dataSize, const void* data, MemoryPtr& memPtr, std::vector& postOpsMem); + void appendMemory(const size_t dataSize, const void* data, MemoryPtr& memPtr, std::vector& postOpsMem); template - void appendPostOpsImpl(dnnl::post_ops& ops, const VectorDims &postOpDims, std::vector& postOpsMem); + void appendPostOpsImpl(dnnl::post_ops& ops, const VectorDims& postOpDims, std::vector& postOpsMem); size_t levels = 0; @@ -273,6 +325,6 @@ class FakeQuantize : public Node { BroadcastingPolicy broadcastingPolicy; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp index 7f6ed99b1173d7..2df6c0ae7522cc 100644 --- a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp +++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp @@ -10,8 +10,10 @@ #include "common/cpu_convert.h" #include "common/cpu_memcpy.h" +#include "cpu_types.h" #include "dnnl_extension_utils.h" #include "executors/memory_arguments.hpp" +#include "fake_quantize.h" #include "graph_context.h" #include "input.h" #include "memory_desc/blocked_memory_desc.h" @@ -19,16 +21,19 @@ #include "memory_desc/cpu_memory_desc_utils.h" #include "nodes/executors/executor.hpp" #include "nodes/executors/fullyconnected_config.hpp" 
+#include "openvino/core/type.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/runtime/threading/cpu_message.hpp" +#include "ov_ops/fully_connected.hpp" +#include "ov_ops/fully_connected_compressed.hpp" +#include "ov_ops/fully_connected_quantized.hpp" +#include "ov_ops/fully_connected_quantized_legacy.hpp" #include "post_ops.hpp" #include "shape_inference/custom/fullyconnected.hpp" -#include "transformations/cpu_opset/common/op/fully_connected.hpp" +#include "transformations/utils/utils.hpp" #include "utils/debug_capabilities.h" #include "utils/general_utils.h" -#include "fake_quantize.h" - using namespace dnnl; using namespace ov::element; @@ -39,25 +44,77 @@ namespace node { bool FullyConnected::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - const auto fc = std::dynamic_pointer_cast(op); - if (!fc) { - errorMessage = "Only legacy FullyConnected operation is supported"; + if (!ov::is_type(op) && + !ov::is_type(op) && + !ov::is_type(op)) { return false; } - if (fc->get_input_size() == 3 && - std::dynamic_pointer_cast(fc->get_input_node_shared_ptr(BIAS_ID)) == nullptr) { - errorMessage = "Only Constant operation on 'bias' input is supported"; + + if (ov::is_type(op)) { + if (!ov::op::util::is_on_constant_path(op->input_value(BIAS))) { + errorMessage = "Only Constant operation on 'bias' input is supported"; + return false; + } + } + + if (ov::is_type(op)) { + if (!ov::op::util::is_on_constant_path(op->input_value(WEIGHT_SCALES)) || + !ov::op::util::is_on_constant_path(op->input_value(WEIGHT_ZERO_POINTS))) { + errorMessage = + "Only Constant operation on 'weight scales', and 'weight zero points' inputs is supported"; + return false; + } + } + } catch (...) 
{ + return false; + } + + return true; +} + +// @todo replace 'inferencePrecision' check with 'fc->get_input_element_type(0) == ov::element::bf16' +// after bf16 pipeline is moved to ConvertPrecision +bool FullyConnected::isSupportedCompressedOperation(const std::shared_ptr& op, + size_t IC, + size_t OC, + size_t G, + ov::element::Type inferencePrecision) noexcept { +#if defined(OPENVINO_ARCH_X86_64) + try { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) return false; + + if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2)) + return false; + + if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx) && + inferencePrecision == ov::element::bf16) { + // OneDNN AMX IP implementation has limited shapes support due to performance considerations. As a + // current solution conditions below are copied from OneDNN to make sure correct IP impl will be + // used since fallback one doesn't support weights decompression feature. + size_t simdWidth = 16; + size_t vnniFactor = 2; + size_t maxSize = 512; + auto amxRow = vnniFactor * simdWidth; + + if ((IC <= amxRow && OC <= amxRow) || (IC <= maxSize && OC <= maxSize && IC % amxRow != 0)) { + return false; + } } - const auto weightRank = fc->get_input_partial_shape(WEIGHTS_ID).size(); - if (weightRank != 2) { - errorMessage = "Doesn't support 'weight' input with rank: " + std::to_string(weightRank); + + if (IC % G != 0 || IC / G < 4 || OC == 1) { return false; } + + return true; } catch (...) { return false; } return true; +#else + return false; +#endif } void FullyConnected::initTensorParallelConfig(const GraphContext::CPtr context) { @@ -66,7 +123,7 @@ void FullyConnected::initTensorParallelConfig(const GraphContext::CPtr context) // init tp_cfg.w_rank and tp_cfg.w_size tp_cfg.w_rank = context->getCPUStreamExecutor()->get_rank()[0]; tp_cfg.w_size = ov::threading::message_manager()->get_num_sub_streams(); - tp_cfg.enable_tensor_parallel = tp_cfg.w_size > 1 ? 
true : false; + tp_cfg.enable_tensor_parallel = tp_cfg.w_size > 1; tp_cfg.sub_memory = context->getSubMemory(); } } @@ -79,6 +136,30 @@ FullyConnected::FullyConnected(const std::shared_ptr& op, const GraphC initTensorParallelConfig(context); if (!isSupportedOperation(op, errorMessage)) OPENVINO_THROW_NOT_IMPLEMENTED(errorMessage); + + m_atoi[ARG_SRC] = DATA; + m_atoi[ARG_WEI] = WEIGHTS; + m_atoi[ARG_BIAS] = BIAS; + + auto mapArgToInput = [&op](std::unordered_map& argToInput, size_t argId, size_t inputId) { + if (op->get_input_size() > inputId && op->input(inputId).get_element_type() != ov::element::undefined) { + argToInput[argId] = inputId; + } + }; + + if (ov::is_type(op)) { + mapArgToInput(m_atoi, ARG_WEI | ARG_ATTR_SCALES, WEIGHT_SCALES); + mapArgToInput(m_atoi, ARG_WEI | ARG_ATTR_ZERO_POINTS, WEIGHT_ZERO_POINTS); + algorithm = Algorithm::FullyConnectedCompressed; + } else if (ov::is_type(op)) { + mapArgToInput(m_atoi, ARG_DST_DEQ_SCALE, 3); + algorithm = Algorithm::FullyConnectedQuantizedLegacy; + } else if (ov::is_type(op)) { + algorithm = Algorithm::FullyConnectedQuantized; + OPENVINO_THROW_NOT_IMPLEMENTED("FullyConnectedQuantized is not implemented yet"); + } else { + algorithm = Algorithm::FullyConnectedCommon; + } } bool FullyConnected::canBeExecutedInInt8() const { @@ -108,7 +189,8 @@ void FullyConnected::needPrepareParamsForTensorParallel() { dim += dims.size(); } OPENVINO_ASSERT(static_cast(dims[dim]) >= tp_cfg.w_size, - getName() + " dim[" + std::to_string(dim) + "] is " + std::to_string(dims[dim]) + ", which is larger than w_size " + std::to_string(tp_cfg.w_size)); + getName() + " dim[" + std::to_string(dim) + "] is " + std::to_string(dims[dim]) + + ", which is larger than w_size " + std::to_string(tp_cfg.w_size)); auto splited_dim_vec = split_parts(dims[dim], tp_cfg.w_size); VectorDims new_dims = std::move(dims); @@ -119,22 +201,18 @@ void FullyConnected::needPrepareParamsForTensorParallel() { } } -ExecutorPtr FullyConnected::createExecutor() { - 
const auto& executor = factory->make(memory); - getSelectedPrimitiveDescriptor()->setImplementationType(executor->implType()); - - return executor; -} - void FullyConnected::prepareParams() { needPrepareParamsForTensorParallel(); - executor = createExecutor(); + + executor->update(memory); + // @todo avoid updating implementation type in scope of every prepareParams call + getSelectedPrimitiveDescriptor()->setImplementationType(executor->implType()); } void FullyConnected::initTensorParallelSync() { if (tp_cfg.enable_tensor_parallel) { tp_cfg.id = tp_cfg.sub_memory->get_memory_id(tp_cfg.w_rank); - OPENVINO_ASSERT(tp_cfg.id > 0, "Tensor Parallel Config ID cannot be negative."); + OPENVINO_ASSERT(tp_cfg.id >= 0, "Tensor Parallel Config ID cannot be negative."); tp_cfg.sub_memory->set_memory_used(tp_cfg.id, tp_cfg.w_rank); while (true) { std::lock_guard lock(tp_cfg.sub_memory->_flagMutex); @@ -191,18 +269,34 @@ void FullyConnected::execTensorParallelSync() { for (int idx = 0; idx < tp_cfg.w_size; idx++) { if (wait_list[idx] > 0 && tp_cfg.sub_memory->_memorys_table[tp_cfg.id][idx].flag) { auto new_ptr = static_cast(tp_cfg.sub_memory->_memorys_table[tp_cfg.id][idx].send_buf); - const auto copySize = splited_dim_vec[idx] * prec.size(); // bytes of half selected dim. + const auto copySize = splited_dim_vec[idx] * prec.size(); // bytes of half selected dim. 
const size_t unloop = 8; size_t step = count / unloop; - parallel_for(step, [&](size_t i){ - cpu_memcpy(dst_ptr + idx * strideSize + (i * unloop) * channel_size, new_ptr + (i * unloop) * copySize, copySize); - cpu_memcpy(dst_ptr + idx * strideSize + (i * unloop + 1) * channel_size, new_ptr + (i * unloop + 1) * copySize, copySize); - cpu_memcpy(dst_ptr + idx * strideSize + (i * unloop + 2) * channel_size, new_ptr + (i * unloop + 2) * copySize, copySize); - cpu_memcpy(dst_ptr + idx * strideSize + (i * unloop + 3) * channel_size, new_ptr + (i * unloop + 3) * copySize, copySize); - cpu_memcpy(dst_ptr + idx * strideSize + (i * unloop + 4) * channel_size, new_ptr + (i * unloop + 4) * copySize, copySize); - cpu_memcpy(dst_ptr + idx * strideSize + (i * unloop + 5) * channel_size, new_ptr + (i * unloop + 5) * copySize, copySize); - cpu_memcpy(dst_ptr + idx * strideSize + (i * unloop + 6) * channel_size, new_ptr + (i * unloop + 6) * copySize, copySize); - cpu_memcpy(dst_ptr + idx * strideSize + (i * unloop + 7) * channel_size, new_ptr + (i * unloop + 7) * copySize, copySize); + parallel_for(step, [&](size_t i) { + cpu_memcpy(dst_ptr + idx * strideSize + (i * unloop) * channel_size, + new_ptr + (i * unloop) * copySize, + copySize); + cpu_memcpy(dst_ptr + idx * strideSize + (i * unloop + 1) * channel_size, + new_ptr + (i * unloop + 1) * copySize, + copySize); + cpu_memcpy(dst_ptr + idx * strideSize + (i * unloop + 2) * channel_size, + new_ptr + (i * unloop + 2) * copySize, + copySize); + cpu_memcpy(dst_ptr + idx * strideSize + (i * unloop + 3) * channel_size, + new_ptr + (i * unloop + 3) * copySize, + copySize); + cpu_memcpy(dst_ptr + idx * strideSize + (i * unloop + 4) * channel_size, + new_ptr + (i * unloop + 4) * copySize, + copySize); + cpu_memcpy(dst_ptr + idx * strideSize + (i * unloop + 5) * channel_size, + new_ptr + (i * unloop + 5) * copySize, + copySize); + cpu_memcpy(dst_ptr + idx * strideSize + (i * unloop + 6) * channel_size, + new_ptr + (i * unloop + 6) * 
copySize, + copySize); + cpu_memcpy(dst_ptr + idx * strideSize + (i * unloop + 7) * channel_size, + new_ptr + (i * unloop + 7) * copySize, + copySize); }); size_t tail = count & ~(unloop - 1); for (size_t i = tail; i < count; ++i) { @@ -224,6 +318,7 @@ void FullyConnected::execTensorParallelSync() { } } } + void FullyConnected::execute(dnnl::stream strm) { initTensorParallelSync(); @@ -370,31 +465,11 @@ static bool useSparseWeightsDecompression(const NodePtr& weightsInput, return sparseRate >= minSparseRate; } -void FullyConnected::needUpdateDQScaleForTensorParallel(std::vector& dequantizationScales) { - if (tp_cfg.enable_tensor_parallel) { - auto split_parts = [](int len, int n) { - int average = len / n; - std::vector parts(n, average); - parts.back() = len - average * (n - 1); - return parts; - }; - auto DQScales = getDQScales(); - auto split_lens = split_parts(DQScales.size(), tp_cfg.w_size); - auto split_offset = tp_cfg.w_rank * split_lens[0]; - std::vector newDQScales(split_lens[tp_cfg.w_rank]); - std::copy(DQScales.begin() + split_offset, DQScales.begin() + split_offset + split_lens[tp_cfg.w_rank], newDQScales.begin()); - dequantizationScales = std::move(newDQScales); - } -} - void FullyConnected::initSupportedPrimitiveDescriptors() { - attrs.withBias = getOriginalInputsNumber() == 3; - - attrs.dequantizationScales = getDQScales(); - needUpdateDQScaleForTensorParallel(attrs.dequantizationScales); + attrs.withBias = getOriginalInputPrecisionAtPort(BIAS) != ov::element::undefined; - attrs.sparseWeights = useSparseWeightsDecompression(getParentEdgeAt(WEIGHTS_ID)->getParent(), - getOriginalInputPrecisionAtPort(DATA_ID), + attrs.sparseWeights = useSparseWeightsDecompression(getParentEdgeAt(WEIGHTS)->getParent(), + getOriginalInputPrecisionAtPort(DATA), context->getConfig().fcSparseWeiDecompressionRate); attrs.dynamicQuantizationGroupSize = context->getConfig().fcDynamicQuantizationGroupSize; attrs.modelType = context->getConfig().modelType; @@ -410,6 +485,10 @@ 
void FullyConnected::initSupportedPrimitiveDescriptors() { VecMemoryDescs srcDescs; const auto& creatorsMap = BlockedDescCreator::getCommonCreators(); for (size_t i = 0; i < srcTypes.size(); i++) { + if (srcTypes[i] == element::undefined) { + srcDescs.push_back(MemoryDescUtils::makeEmptyDesc()); + continue; + } const auto srcDesc = creatorsMap.at(LayoutType::ncsp)->createSharedDesc(srcTypes[i], getInputShapeAtPort(i)); srcDescs.push_back(srcDesc); } @@ -421,23 +500,31 @@ void FullyConnected::initSupportedPrimitiveDescriptors() { } MemoryDescArgs descs{ - {ARG_SRC, srcDescs[0]}, - {ARG_WEI, srcDescs[1]}, - {ARG_BIAS, attrs.withBias ? srcDescs[2] : MemoryDescUtils::makeEmptyDesc()}, + {ARG_SRC, srcDescs[DATA]}, + {ARG_WEI, srcDescs[WEIGHTS]}, + {ARG_BIAS, srcDescs[BIAS]}, {ARG_DST, dstDescs[0]}, }; - needUpdateScaleForTensorParallel(); - needUpdateZeroPointForTensorParallel(); - auto executionContext = std::make_shared(context, getImplPriority(), privateWeightCache); - factory = std::make_shared>(attrs, postOps, executionContext, descs); + factory = std::make_shared>(attrs, postOps, executionContext, descs); const auto nodeDescriptors = factory->getProperMemoryDescriptors(descs); NodeConfig nodeConfig; - nodeConfig.inConfs.emplace_back(nodeDescriptors.at(ARG_SRC)); - nodeConfig.inConfs.emplace_back(nodeDescriptors.at(ARG_WEI)); - if (attrs.withBias) nodeConfig.inConfs.emplace_back(nodeDescriptors.at(ARG_BIAS)); + nodeConfig.inConfs.resize(srcDescs.size()); + + for (const auto& desc : nodeDescriptors) { + if (m_atoi.count(desc.first)) { + nodeConfig.inConfs[m_atoi[desc.first]] = desc.second; + } + } + + // add extra inputs bypassing proper memory descriptors + // @todo pass all the input descriptors to getProperMemoryDescriptors and allow + // to ignore extra input descriptors if necessery + for (size_t i = 3; i < srcDescs.size(); i++) { + nodeConfig.inConfs[i] = srcDescs[i]; + } const int inPlace = canBeInPlace() ? 
0 : -1; nodeConfig.outConfs.emplace_back(nodeDescriptors.at(ARG_DST), BlockedMemoryDesc::FULL_MASK, inPlace); @@ -447,19 +534,21 @@ void FullyConnected::initSupportedPrimitiveDescriptors() { void FullyConnected::needSplitMemoryForTensorParallel() { if (tp_cfg.enable_tensor_parallel) { - auto src = getSrcMemoryAtPort(DATA_ID); - auto wgt = getSrcMemoryAtPort(WEIGHTS_ID); + auto src = getSrcMemoryAtPort(DATA); + auto wgt = getSrcMemoryAtPort(WEIGHTS); auto dst = getDstMemoryAtPort(0); // src - memory[ARG_SRC] = getSrcMemoryAtPort(DATA_ID); + memory[ARG_SRC] = getSrcMemoryAtPort(DATA); // wgt // split N direction - tp_cfg.cached_splited_weight = attrs.weightsNonTransposed ? split_vertical(context->getEngine(), std::move(wgt), 0, tp_cfg.w_rank, tp_cfg.w_size) - : split_horizontal(context->getEngine(), std::move(wgt), 0, tp_cfg.w_rank, tp_cfg.w_size); + tp_cfg.cached_splited_weight = + attrs.weightsNonTransposed + ? split_vertical(context->getEngine(), std::move(wgt), 0, tp_cfg.w_rank, tp_cfg.w_size) + : split_horizontal(context->getEngine(), std::move(wgt), 0, tp_cfg.w_rank, tp_cfg.w_size); memory[ARG_WEI] = tp_cfg.cached_splited_weight; // bias if (attrs.withBias) { - auto bias = getSrcMemoryAtPort(BIAS_ID); + auto bias = getSrcMemoryAtPort(BIAS); auto select_bias = split_horizontal(context->getEngine(), std::move(bias), 0, tp_cfg.w_rank, tp_cfg.w_size); tp_cfg.cached_splited_bias = std::move(select_bias); } else { @@ -468,7 +557,28 @@ void FullyConnected::needSplitMemoryForTensorParallel() { memory[ARG_BIAS] = tp_cfg.cached_splited_bias; // dst memory[ARG_DST] = getDstMemoryAtPort(0); - tp_cfg.cached_dst = split_horizontal(context->getEngine(), std::move(dst), -1, tp_cfg.w_rank, tp_cfg.w_size, false); + tp_cfg.cached_dst = + split_horizontal(context->getEngine(), std::move(dst), -1, tp_cfg.w_rank, tp_cfg.w_size, false); + + memory[ARG_DST | ARG_ATTR_SCALES] = + split_horizontal(context->getEngine(), memory[ARG_DST | ARG_ATTR_SCALES], 0, tp_cfg.w_rank, tp_cfg.w_size); 
+ + auto scale_mem = std::const_pointer_cast(memory[ARG_WEI | ARG_ATTR_SCALES]); + memory[ARG_WEI | ARG_ATTR_SCALES] = + attrs.weightsNonTransposed + ? split_vertical(context->getEngine(), scale_mem, 0, tp_cfg.w_rank, tp_cfg.w_size) + : split_horizontal(context->getEngine(), scale_mem, 0, tp_cfg.w_rank, tp_cfg.w_size); + + auto zeropoint_mem = std::const_pointer_cast(memory[ARG_WEI | ARG_ATTR_ZERO_POINTS]); + auto element_num = zeropoint_mem->getSize() / zeropoint_mem->getPrecision().size(); + if (element_num == 1) { + tp_cfg.cached_zeropoint = zeropoint_mem; + } else { + tp_cfg.cached_zeropoint = + attrs.weightsNonTransposed + ? split_vertical(context->getEngine(), zeropoint_mem, 0, tp_cfg.w_rank, tp_cfg.w_size) + : split_horizontal(context->getEngine(), zeropoint_mem, 0, tp_cfg.w_rank, tp_cfg.w_size); + } } } @@ -477,7 +587,7 @@ void FullyConnected::needUpdateTensorParalelConfig() { // 1. weight shape is dynamic // 2. last dim can be splited. if (tp_cfg.enable_tensor_parallel) { - auto& shape = getSrcMemoryAtPort(WEIGHTS_ID)->getShape(); + auto& shape = getSrcMemoryAtPort(WEIGHTS)->getShape(); if (shape.isDynamic()) { tp_cfg.enable_tensor_parallel = false; } else if (shape.getDims()[0] < static_cast(tp_cfg.w_size)) { @@ -485,18 +595,22 @@ void FullyConnected::needUpdateTensorParalelConfig() { } } } + void FullyConnected::createPrimitive() { needUpdateTensorParalelConfig(); - memory[ARG_SRC] = getSrcMemoryAtPort(DATA_ID); - memory[ARG_WEI] = getSrcMemoryAtPort(WEIGHTS_ID); - memory[ARG_BIAS] = attrs.withBias ? getSrcMemoryAtPort(BIAS_ID) : MemoryDescUtils::makeEmptyMemory(context); + for (const auto& entry : m_atoi) { + const auto argumentId = entry.first; + const auto inputId = entry.second; + memory[argumentId] = getSrcMemoryAtPort(inputId); + } + memory[ARG_DST] = getDstMemoryAtPort(0); needSplitMemoryForTensorParallel(); // @todo should we preconfigure only for dynamic shapes? 
// Since for static shapes primitive is created in scope of compile_model() anyway - factory->preconfigure(memory); + executor = factory->make(memory); Node::createPrimitive(); } @@ -517,49 +631,6 @@ ov::element::Type FullyConnected::getRuntimePrecision() const { return getMaxPrecision(srcTypes); } -void FullyConnected::needUpdateScaleForTensorParallel() { - if (tp_cfg.enable_tensor_parallel && tp_cfg.cached_scale) { - attrs.decompressionMultiplyPtr = tp_cfg.cached_scale; - } -} - -void FullyConnected::needSplitScaleForTensorParallel(const MemoryCPtr& memory) { - if (tp_cfg.enable_tensor_parallel && !tp_cfg.cached_scale) { - auto scale_mem = std::const_pointer_cast(memory); - tp_cfg.cached_scale = attrs.weightsNonTransposed ? split_vertical(context->getEngine(), std::move(scale_mem), 0, tp_cfg.w_rank, tp_cfg.w_size) - : split_horizontal(context->getEngine(), std::move(scale_mem), 0, tp_cfg.w_rank, tp_cfg.w_size); - } -} - -void FullyConnected::needUpdateZeroPointForTensorParallel() { - if (tp_cfg.enable_tensor_parallel && tp_cfg.cached_zeropoint) { - attrs.decompressionSubtractPtr = tp_cfg.cached_zeropoint; - } -} - -void FullyConnected::needSplitZeroPointForTensorParallel(const MemoryCPtr& memory) { - if (tp_cfg.enable_tensor_parallel && !tp_cfg.cached_zeropoint) { - auto zeropoint_mem = std::const_pointer_cast(memory); - auto element_num = memory->getSize() / memory->getPrecision().size(); - if (element_num == 1) { - tp_cfg.cached_zeropoint = std::move(zeropoint_mem); - } else { - tp_cfg.cached_zeropoint = attrs.weightsNonTransposed ? 
split_vertical(context->getEngine(), zeropoint_mem, 0, tp_cfg.w_rank, tp_cfg.w_size) - : split_horizontal(context->getEngine(), zeropoint_mem, 0, tp_cfg.w_rank, tp_cfg.w_size); - } - } -} - -void FullyConnected::fuseDecompressionMultiply(const MemoryCPtr& memory) { - attrs.decompressionMultiplyPtr = memory; - needSplitScaleForTensorParallel(memory); -} - -void FullyConnected::fuseDecompressionSubtract(const MemoryCPtr& memory) { - attrs.decompressionSubtractPtr = memory; - needSplitZeroPointForTensorParallel(memory); -} - } // namespace node } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.h b/src/plugins/intel_cpu/src/nodes/fullyconnected.h index be29342b851988..0b50d882c9e554 100644 --- a/src/plugins/intel_cpu/src/nodes/fullyconnected.h +++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.h @@ -6,17 +6,18 @@ #include +#include #include #include #include +#include #include #include "cpu_memory.h" #include "nodes/executors/executor_factory.hpp" -#include "nodes/executors/memory_arguments.hpp" #include "nodes/executors/fullyconnected_config.hpp" +#include "nodes/executors/memory_arguments.hpp" #include "post_ops.hpp" -#include "openvino/runtime/threading/cpu_message.hpp" namespace ov { namespace intel_cpu { @@ -66,6 +67,15 @@ class FullyConnected : public Node { bool canFuse(const NodePtr& node) const override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + static bool isSupportedCompressedOperation(const std::shared_ptr& op, + size_t IC, + size_t OC, + size_t G, + ov::element::Type inferencePrecision) noexcept; + + bool isExecutable() const override { + return !isInputTensorAtPortEmpty(0); + } void prepareParams() override; void executeDynamicImpl(dnnl::stream strm) override; @@ -81,11 +91,22 @@ class FullyConnected : public Node { void toNumaNodeImpl(int numaID) override; private: - static const size_t DATA_ID = 0; - static const size_t WEIGHTS_ID = 1; - 
static const size_t BIAS_ID = 2; + enum InputId : size_t { + DATA = 0, + WEIGHTS, + BIAS, + WEIGHT_SCALES, + WEIGHT_ZERO_POINTS, + INPUT_SCALES, + INPUT_ZERO_POINTS, + OUTPUT_SCALES, + OUTPUT_ZERO_POINTS, + }; + + static bool isConstantInput(const std::shared_ptr& op, InputId port); + + std::unordered_map m_atoi; // memory argument id to input id - ExecutorPtr createExecutor(); void fuseDecompressionConstant(const MemoryCPtr& memory, MemoryCPtr& decompressionValuesPtr); void initTensorParallelConfig(const GraphContext::CPtr context); @@ -94,16 +115,11 @@ class FullyConnected : public Node { void initTensorParallelSync(); void execTensorParallelSync(); void needSplitMemoryForTensorParallel(); - void needSplitScaleForTensorParallel(const MemoryCPtr& memory); - void needUpdateScaleForTensorParallel(); - void needSplitZeroPointForTensorParallel(const MemoryCPtr& memory); - void needUpdateZeroPointForTensorParallel(); - void needUpdateDQScaleForTensorParallel(std::vector& dequantizationScales); FCAttrs attrs; PostOps postOps; MemoryArgs memory; - ExecutorFactoryPtr factory; + ExecutorFactoryPtr factory; ExecutorPtr executor = nullptr; std::string errorPrefix; diff --git a/src/plugins/intel_cpu/src/nodes/gather.h b/src/plugins/intel_cpu/src/nodes/gather.h index 6ee097e9a1fbab..c20a56807b0165 100644 --- a/src/plugins/intel_cpu/src/nodes/gather.h +++ b/src/plugins/intel_cpu/src/nodes/gather.h @@ -5,12 +5,13 @@ #pragma once #include -#include "kernels/x64/gather_uni_kernel.hpp" #include #include #include +#include "kernels/x64/gather_uni_kernel.hpp" + namespace ov { namespace intel_cpu { namespace node { @@ -19,7 +20,7 @@ class Gather : public Node { public: Gather(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void createPrimitive() override; void execute(dnnl::stream strm) override; @@ -115,6 +116,6 @@ class Gather : 
public Node { std::shared_ptr jitKernel; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/gather_elements.cpp b/src/plugins/intel_cpu/src/nodes/gather_elements.cpp index 8653bda8c483d3..d8f221dcebf34d 100644 --- a/src/plugins/intel_cpu/src/nodes/gather_elements.cpp +++ b/src/plugins/intel_cpu/src/nodes/gather_elements.cpp @@ -2,23 +2,25 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "gather_elements.h" + #include -#include #include +#include + +#include "common/cpu_memcpy.h" #include "openvino/core/parallel.hpp" -#include "gather_elements.h" #include "openvino/opsets/opset1.hpp" #include "utils/general_utils.h" -#include "common/cpu_memcpy.h" namespace ov { namespace intel_cpu { namespace node { -bool GatherElements::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool GatherElements::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage) noexcept { try { - if (!one_of(op->get_type_info(), - ov::op::v6::GatherElements::get_type_info_static())) { + if (!one_of(op->get_type_info(), ov::op::v6::GatherElements::get_type_info_static())) { errorMessage = "Node is not an instance of the GatherElements operation from operation set v6."; return false; } @@ -88,8 +90,7 @@ void GatherElements::initSupportedPrimitiveDescriptors() { dataTypeSize_ = inDataPrecision.size(); - addSupportedPrimDesc({{LayoutType::ncsp, inDataPrecision}, - {LayoutType::ncsp, ov::element::i32}}, + addSupportedPrimDesc({{LayoutType::ncsp, inDataPrecision}, {LayoutType::ncsp, ov::element::i32}}, {{LayoutType::ncsp, inDataPrecision}}, impl_desc_type::ref_any); } @@ -100,9 +101,9 @@ void GatherElements::executeDynamicImpl(dnnl::stream strm) { template void GatherElements::directExecution() { - const auto *srcData = getSrcDataAtPortAs(dataIndex_); - const auto *indices = getSrcDataAtPortAs(indicesIndex_); - 
auto *dstData = getDstDataAtPortAs(0); + const auto* srcData = getSrcDataAtPortAs(dataIndex_); + const auto* indices = getSrcDataAtPortAs(indicesIndex_); + auto* dstData = getDstDataAtPortAs(0); const int outSize = getChildEdgeAt(0)->getMemory().getShape().getElementsCount(); auto threadBody = [&](const int ithr, const int nthr) { @@ -133,14 +134,14 @@ void GatherElements::directExecution() { void GatherElements::execute(dnnl::stream strm) { switch (dataTypeSize_) { - case sizeof(element_type_traits::value_type): - return directExecution::value_type>(); - case sizeof(element_type_traits::value_type): - return directExecution::value_type>(); - case sizeof(element_type_traits::value_type): - return directExecution::value_type>(); - default: - OPENVINO_THROW("Unsupported data type size"); + case sizeof(element_type_traits::value_type): + return directExecution::value_type>(); + case sizeof(element_type_traits::value_type): + return directExecution::value_type>(); + case sizeof(element_type_traits::value_type): + return directExecution::value_type>(); + default: + OPENVINO_THROW("Unsupported data type size"); } } @@ -148,6 +149,6 @@ bool GatherElements::created() const { return getType() == Type::GatherElements; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/gather_elements.h b/src/plugins/intel_cpu/src/nodes/gather_elements.h index 3c2282401f7431..b050cd4e523490 100644 --- a/src/plugins/intel_cpu/src/nodes/gather_elements.h +++ b/src/plugins/intel_cpu/src/nodes/gather_elements.h @@ -14,7 +14,7 @@ class GatherElements : public Node { public: GatherElements(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; @@ -40,6 
+40,6 @@ class GatherElements : public Node { void directExecution(); }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/gather_nd.cpp b/src/plugins/intel_cpu/src/nodes/gather_nd.cpp index 8c81f9b770a687..e962839e571663 100644 --- a/src/plugins/intel_cpu/src/nodes/gather_nd.cpp +++ b/src/plugins/intel_cpu/src/nodes/gather_nd.cpp @@ -2,15 +2,17 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "gather_nd.h" + #include -#include +#include #include +#include + +#include "common/cpu_memcpy.h" #include "dnnl_types.h" #include "openvino/core/parallel.hpp" -#include "gather_nd.h" -#include #include "utils/general_utils.h" -#include "common/cpu_memcpy.h" #define THROW_ERROR(...) OPENVINO_THROW("GatherND layer with name '", getName(), "' ", __VA_ARGS__) @@ -20,7 +22,9 @@ namespace node { bool GatherND::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - if (!one_of(op->get_type_info(), ov::op::v5::GatherND::get_type_info_static(), ov::op::v8::GatherND::get_type_info_static())) { + if (!one_of(op->get_type_info(), + ov::op::v5::GatherND::get_type_info_static(), + ov::op::v8::GatherND::get_type_info_static())) { errorMessage = "Node is not an instance of the GatherND operation from operation set v5 and v8."; return false; } @@ -70,12 +74,16 @@ void GatherND::initSupportedPrimitiveDescriptors() { ov::element::Type indicesPrecision = getOriginalInputPrecisionAtPort(GATHERND_INDEXES); if (!one_of(indicesPrecision, - ov::element::i32, ov::element::i64, ov::element::i16, ov::element::u16, ov::element::i8, ov::element::u8)) { + ov::element::i32, + ov::element::i64, + ov::element::i16, + ov::element::u16, + ov::element::i8, + ov::element::u8)) { THROW_ERROR("has unsupported 'indices' input precision: ", indicesPrecision); } - addSupportedPrimDesc({{LayoutType::ncsp, inDataPrecision}, - {LayoutType::ncsp, 
ov::element::i32}}, + addSupportedPrimDesc({{LayoutType::ncsp, inDataPrecision}, {LayoutType::ncsp, ov::element::i32}}, {{LayoutType::ncsp, inDataPrecision}}, impl_desc_type::ref_any); } @@ -96,24 +104,33 @@ void GatherND::prepareParams() { attrs.srcDims = srcMemPtr->getStaticDims(); attrs.srcStrides = srcMemPtr->getDescWithType()->getStrides(); attrs.dstElementCount = dstMemPtr->getShape().getElementsCount(); - attrs.sliceRank = idxMemPtr->getStaticDims().back(); + attrs.sliceRank = idxMemPtr->getStaticDims().back(); execPtr = std::make_shared(attrs); } -GatherND::GatherNDExecutor::GatherNDExecutor(const GatherNDAttributes& attrs) : sliceRank(attrs.sliceRank), dataSize(attrs.dataSize) { - batchSize = std::accumulate(attrs.srcDims.begin(), attrs.srcDims.begin() + attrs.batchDims, size_t(1), std::multiplies()); - dataLength = std::accumulate(attrs.srcDims.begin() + sliceRank + attrs.batchDims, attrs.srcDims.end(), size_t(1), +GatherND::GatherNDExecutor::GatherNDExecutor(const GatherNDAttributes& attrs) + : sliceRank(attrs.sliceRank), + dataSize(attrs.dataSize) { + batchSize = std::accumulate(attrs.srcDims.begin(), + attrs.srcDims.begin() + attrs.batchDims, + size_t(1), + std::multiplies()); + dataLength = std::accumulate(attrs.srcDims.begin() + sliceRank + attrs.batchDims, + attrs.srcDims.end(), + size_t(1), std::multiplies()); cycles = attrs.dstElementCount / (dataLength * batchSize); workAmount = batchSize * cycles; - srcBatchStride = std::accumulate(attrs.srcDims.begin() + attrs.batchDims, attrs.srcDims.end(), size_t(1), + srcBatchStride = std::accumulate(attrs.srcDims.begin() + attrs.batchDims, + attrs.srcDims.end(), + size_t(1), std::multiplies()); idxBatchStride = cycles * sliceRank; dstBatchStride = cycles * dataLength; srcShifts.resize(attrs.sliceRank, 0); - for (size_t i = 0; i < attrs.sliceRank ; i++) + for (size_t i = 0; i < attrs.sliceRank; i++) srcShifts[i] = attrs.srcStrides[i + attrs.batchDims] * (dataLength > 1 ? 
dataSize : 1); // optimized implementation 'blocks' via memcpy @@ -128,25 +145,33 @@ void GatherND::execute(dnnl::stream strm) { if (!execPtr) THROW_ERROR("has not compiled executor."); - execPtr->exec(getSrcMemoryAtPort(GATHERND_DATA), - getSrcMemoryAtPort(GATHERND_INDEXES), - getDstMemoryAtPort(0)); + execPtr->exec(getSrcMemoryAtPort(GATHERND_DATA), getSrcMemoryAtPort(GATHERND_INDEXES), getDstMemoryAtPort(0)); } -void GatherND::GatherNDExecutor::exec(const MemoryPtr& srcMemPtr, const MemoryPtr& idxMemPtr, const MemoryPtr& dstMemPtr) { +void GatherND::GatherNDExecutor::exec(const MemoryPtr& srcMemPtr, + const MemoryPtr& idxMemPtr, + const MemoryPtr& dstMemPtr) { if (dataLength > 1) { gatherBlocks(srcMemPtr, idxMemPtr, dstMemPtr); return; } - GatherNDContext ctx { this, srcMemPtr, idxMemPtr, dstMemPtr }; - OV_SWITCH(intel_cpu, GatherNDEmitter, ctx, dataSize, - OV_CASE(sizeof(element_type_traits::value_type), element_type_traits::value_type), - OV_CASE(sizeof(element_type_traits::value_type), element_type_traits::value_type), - OV_CASE(sizeof(element_type_traits::value_type), element_type_traits::value_type)); + GatherNDContext ctx{this, srcMemPtr, idxMemPtr, dstMemPtr}; + OV_SWITCH(intel_cpu, + GatherNDEmitter, + ctx, + dataSize, + OV_CASE(sizeof(element_type_traits::value_type), + element_type_traits::value_type), + OV_CASE(sizeof(element_type_traits::value_type), + element_type_traits::value_type), + OV_CASE(sizeof(element_type_traits::value_type), + element_type_traits::value_type)); } -void GatherND::GatherNDExecutor::gatherBlocks(const MemoryPtr& srcMemPtr, const MemoryPtr& idxMemPtr, const MemoryPtr& dstMemPtr) { +void GatherND::GatherNDExecutor::gatherBlocks(const MemoryPtr& srcMemPtr, + const MemoryPtr& idxMemPtr, + const MemoryPtr& dstMemPtr) { const uint8_t* srcData = srcMemPtr->getDataAs(); const int32_t* indices = idxMemPtr->getDataAs(); uint8_t* dstData = dstMemPtr->getDataAs(); @@ -183,7 +208,9 @@ void GatherND::GatherNDExecutor::gatherBlocks(const 
MemoryPtr& srcMemPtr, const } template -void GatherND::GatherNDExecutor::gatherElementwise(const MemoryPtr& srcMemPtr, const MemoryPtr& idxMemPtr, const MemoryPtr& dstMemPtr) { +void GatherND::GatherNDExecutor::gatherElementwise(const MemoryPtr& srcMemPtr, + const MemoryPtr& idxMemPtr, + const MemoryPtr& dstMemPtr) { const dataType* srcData = srcMemPtr->getDataAs(); const int32_t* indices = idxMemPtr->getDataAs(); dataType* dstData = dstMemPtr->getDataAs(); @@ -227,6 +254,6 @@ bool GatherND::created() const { return getType() == Type::GatherND; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/gather_nd.h b/src/plugins/intel_cpu/src/nodes/gather_nd.h index ed643a2da08899..312cb465bf9e6c 100644 --- a/src/plugins/intel_cpu/src/nodes/gather_nd.h +++ b/src/plugins/intel_cpu/src/nodes/gather_nd.h @@ -14,7 +14,7 @@ class GatherND : public Node { public: GatherND(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; @@ -65,7 +65,7 @@ class GatherND : public Node { MemoryPtr dstMemPtr; }; - template + template struct GatherNDEmitter { void operator()(GatherNDContext& ctx) { ctx.executor->gatherElementwise(ctx.srcMemPtr, ctx.idxMemPtr, ctx.dstMemPtr); @@ -80,6 +80,6 @@ class GatherND : public Node { executorPtr execPtr = nullptr; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/gather_tree.cpp b/src/plugins/intel_cpu/src/nodes/gather_tree.cpp index 5834cd1e1048ba..2ff9a1ccdb8f59 100644 --- a/src/plugins/intel_cpu/src/nodes/gather_tree.cpp +++ b/src/plugins/intel_cpu/src/nodes/gather_tree.cpp @@ -2,13 
+2,14 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "openvino/op/gather_tree.hpp" + +#include #include #include -#include -#include "openvino/op/gather_tree.hpp" -#include "openvino/core/parallel.hpp" #include "gather_tree.h" +#include "openvino/core/parallel.hpp" #include "utils/general_utils.h" namespace ov { @@ -59,11 +60,11 @@ void GatherTree::initSupportedPrimitiveDescriptors() { if (!one_of(precision, ov::element::f32, ov::element::i32)) precision = ov::element::f32; - if (getOriginalInputPrecisionAtPort(GATHER_TREE_PARENT_IDX) != precision || + if (getOriginalInputPrecisionAtPort(GATHER_TREE_PARENT_IDX) != precision || getOriginalInputPrecisionAtPort(GATHER_TREE_MAX_SEQ_LEN) != precision || - getOriginalInputPrecisionAtPort(GATHER_TREE_END_TOKEN) != precision || - getOriginalOutputPrecisionAtPort(0) != precision) { - OPENVINO_THROW(errorPrefix, " has incorrect input/output data precision. Must be the same."); + getOriginalInputPrecisionAtPort(GATHER_TREE_END_TOKEN) != precision || + getOriginalOutputPrecisionAtPort(0) != precision) { + OPENVINO_THROW(errorPrefix, " has incorrect input/output data precision. 
Must be the same."); } addSupportedPrimDesc({{LayoutType::ncsp, precision}, @@ -121,13 +122,15 @@ void GatherTree::executeDynamicImpl(dnnl::stream strm) { execute(strm); } -GatherTree::GatherTreeExecutor::GatherTreeExecutor(const VectorDims& stepIdxDims, const VectorDims& parentIdxDims, - const VectorDims& maxSeqLenDims, const VectorDims& dstDims) - : maxTime{static_cast(stepIdxDims[0])} - , batchSize{stepIdxDims[1]} - , beamWidth{stepIdxDims[2]} - , bbSize{batchSize * beamWidth} - , parentIdxSize{std::accumulate(parentIdxDims.cbegin(), parentIdxDims.cend(), 1lu, std::multiplies())} { +GatherTree::GatherTreeExecutor::GatherTreeExecutor(const VectorDims& stepIdxDims, + const VectorDims& parentIdxDims, + const VectorDims& maxSeqLenDims, + const VectorDims& dstDims) + : maxTime{static_cast(stepIdxDims[0])}, + batchSize{stepIdxDims[1]}, + beamWidth{stepIdxDims[2]}, + bbSize{batchSize * beamWidth}, + parentIdxSize{std::accumulate(parentIdxDims.cbegin(), parentIdxDims.cend(), 1lu, std::multiplies())} { if (maxTime != static_cast(parentIdxDims[0]) || maxTime != static_cast(dstDims[0]) || batchSize != parentIdxDims[1] || batchSize != dstDims[1] || batchSize != maxSeqLenDims[0] || beamWidth != parentIdxDims[2] || beamWidth != dstDims[2]) { @@ -136,14 +139,17 @@ GatherTree::GatherTreeExecutor::GatherTreeExecutor(const VectorDims& stepIdxDims } } -template -void GatherTree::GatherTreeExecutor::exec(const MemoryPtr& stepIdxMemPtr, const MemoryPtr& parentIdxMemPtr, - const MemoryPtr& maxSeqLenMemPtr, const MemoryPtr& endTokenMemPtr, const MemoryPtr& dstMemPtr) { - const auto *stepIdx = stepIdxMemPtr->getDataAs(); - const auto *parentIdx = parentIdxMemPtr->getDataAs(); - const auto *maxSeqLen = maxSeqLenMemPtr->getDataAs(); +template +void GatherTree::GatherTreeExecutor::exec(const MemoryPtr& stepIdxMemPtr, + const MemoryPtr& parentIdxMemPtr, + const MemoryPtr& maxSeqLenMemPtr, + const MemoryPtr& endTokenMemPtr, + const MemoryPtr& dstMemPtr) { + const auto* stepIdx = 
stepIdxMemPtr->getDataAs(); + const auto* parentIdx = parentIdxMemPtr->getDataAs(); + const auto* maxSeqLen = maxSeqLenMemPtr->getDataAs(); const auto endToken = (endTokenMemPtr->getDataAs())[0]; - auto *finalIdx = dstMemPtr->getDataAs(); + auto* finalIdx = dstMemPtr->getDataAs(); bool incorrectResult = false; parallel_for2d(batchSize, beamWidth, [&](size_t batch, size_t beam) { @@ -164,7 +170,7 @@ void GatherTree::GatherTreeExecutor::exec(const MemoryPtr& stepIdxMemPtr, const } bool finished = false; - auto *final = &finalIdx[batch * beamWidth + beam]; + auto* final = &finalIdx[batch * beamWidth + beam]; for (time = 0; time < maxSequenceInBeam; time++, final += bbSize) { if (finished) (*final) = endToken; @@ -184,6 +190,6 @@ bool GatherTree::created() const { return getType() == Type::GatherTree; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/gather_tree.h b/src/plugins/intel_cpu/src/nodes/gather_tree.h index 69d63f834b555d..9874fceb835ba5 100644 --- a/src/plugins/intel_cpu/src/nodes/gather_tree.h +++ b/src/plugins/intel_cpu/src/nodes/gather_tree.h @@ -14,7 +14,7 @@ class GatherTree : public Node { public: GatherTree(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; @@ -32,7 +32,7 @@ class GatherTree : public Node { const VectorDims& dstDims); ~GatherTreeExecutor() = default; - template + template void exec(const MemoryPtr& stepIdxMemPtr, const MemoryPtr& parentIdxMemPtr, const MemoryPtr& maxSeqLenMemPtr, @@ -60,6 +60,6 @@ class GatherTree : public Node { std::string errorPrefix; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov 
diff --git a/src/plugins/intel_cpu/src/nodes/generate_proposals.cpp b/src/plugins/intel_cpu/src/nodes/generate_proposals.cpp index ae32e1e4729096..0ed50c7b0d73a8 100644 --- a/src/plugins/intel_cpu/src/nodes/generate_proposals.cpp +++ b/src/plugins/intel_cpu/src/nodes/generate_proposals.cpp @@ -2,22 +2,22 @@ // SPDX-License-Identifier: Apache-2.0 // -#include +#include #include #include +#include #include -#include #include -#include +#include #if defined(HAVE_AVX2) -#include +# include #endif -#include "openvino/op/generate_proposals.hpp" -#include "openvino/core/parallel.hpp" #include "common/cpu_memcpy.h" #include "generate_proposals.h" +#include "openvino/core/parallel.hpp" +#include "openvino/op/generate_proposals.hpp" #include "shape_inference/shape_inference_internal_dyn.hpp" namespace ov { @@ -30,21 +30,29 @@ struct Indexer4d { int dim23_; int dim123_; - explicit Indexer4d(int dim0, int dim1, int dim2, int dim3): - dim3_(dim3), dim23_(dim2 * dim3), dim123_(dim1 * dim2 * dim3) { + explicit Indexer4d(int dim0, int dim1, int dim2, int dim3) + : dim3_(dim3), + dim23_(dim2 * dim3), + dim123_(dim1 * dim2 * dim3) { (void)dim0; } int operator()(int i, int j, int k, int n) const { - return i * dim123_ + j * dim23_ + k * dim3_ + n; + return i * dim123_ + j * dim23_ + k * dim3_ + n; } }; - -void refine_anchors(const float* deltas, const float* scores, const float* anchors, - float* proposals, const int anchors_num, const int bottom_H, - const int bottom_W, const float img_H, const float img_W, - const float min_box_H, const float min_box_W, +void refine_anchors(const float* deltas, + const float* scores, + const float* anchors, + float* proposals, + const int anchors_num, + const int bottom_H, + const int bottom_W, + const float img_H, + const float img_W, + const float min_box_H, + const float min_box_W, const float max_delta_log_wh, float coordinates_offset) { Indexer4d delta_idx(anchors_num, 4, bottom_H, bottom_W); @@ -111,18 +119,23 @@ void refine_anchors(const 
float* deltas, const float* scores, const float* ancho void unpack_boxes(const float* p_proposals, float* unpacked_boxes, int* is_dead, int pre_nms_topn) { parallel_for(pre_nms_topn, [&](size_t i) { - unpacked_boxes[0*pre_nms_topn + i] = p_proposals[6*i + 0]; - unpacked_boxes[1*pre_nms_topn + i] = p_proposals[6*i + 1]; - unpacked_boxes[2*pre_nms_topn + i] = p_proposals[6*i + 2]; - unpacked_boxes[3*pre_nms_topn + i] = p_proposals[6*i + 3]; - unpacked_boxes[4*pre_nms_topn + i] = p_proposals[6*i + 4]; - is_dead[i] = (p_proposals[6*i + 5] == 1.0) ? 0 : 1; + unpacked_boxes[0 * pre_nms_topn + i] = p_proposals[6 * i + 0]; + unpacked_boxes[1 * pre_nms_topn + i] = p_proposals[6 * i + 1]; + unpacked_boxes[2 * pre_nms_topn + i] = p_proposals[6 * i + 2]; + unpacked_boxes[3 * pre_nms_topn + i] = p_proposals[6 * i + 3]; + unpacked_boxes[4 * pre_nms_topn + i] = p_proposals[6 * i + 4]; + is_dead[i] = (p_proposals[6 * i + 5] == 1.0) ? 0 : 1; }); } -void nms_cpu(const int num_boxes, int is_dead[], - const float* boxes, int index_out[], size_t* const num_out, - const int base_index, const float nms_thresh, const int max_num_out, +void nms_cpu(const int num_boxes, + int is_dead[], + const float* boxes, + int index_out[], + size_t* const num_out, + const int base_index, + const float nms_thresh, + const int max_num_out, float coordinates_offset) { const int num_proposals = num_boxes; size_t count = 0; @@ -133,9 +146,9 @@ void nms_cpu(const int num_boxes, int is_dead[], const float* y1 = boxes + 3 * num_proposals; #if defined(HAVE_AVX2) - __m256 vc_fone = _mm256_set1_ps(coordinates_offset); + __m256 vc_fone = _mm256_set1_ps(coordinates_offset); __m256i vc_ione = _mm256_set1_epi32(1); - __m256 vc_zero = _mm256_set1_ps(0.0f); + __m256 vc_zero = _mm256_set1_ps(0.0f); __m256 vc_nms_thresh = _mm256_set1_ps(nms_thresh); #endif @@ -156,13 +169,13 @@ void nms_cpu(const int num_boxes, int is_dead[], __m256 vx1i = _mm256_set1_ps(x1[box]); __m256 vy1i = _mm256_set1_ps(y1[box]); - __m256 vA_width = 
_mm256_sub_ps(vx1i, vx0i); + __m256 vA_width = _mm256_sub_ps(vx1i, vx0i); __m256 vA_height = _mm256_sub_ps(vy1i, vy0i); - __m256 vA_area = _mm256_mul_ps(_mm256_add_ps(vA_width, vc_fone), _mm256_add_ps(vA_height, vc_fone)); + __m256 vA_area = _mm256_mul_ps(_mm256_add_ps(vA_width, vc_fone), _mm256_add_ps(vA_height, vc_fone)); for (; tail <= num_boxes - 8; tail += 8) { - __m256i *pdst = reinterpret_cast<__m256i*>(is_dead + tail); - __m256i vdst = _mm256_loadu_si256(pdst); + __m256i* pdst = reinterpret_cast<__m256i*>(is_dead + tail); + __m256i vdst = _mm256_loadu_si256(pdst); __m256 vx0j = _mm256_loadu_ps(x0 + tail); __m256 vy0j = _mm256_loadu_ps(y0 + tail); @@ -174,13 +187,13 @@ void nms_cpu(const int num_boxes, int is_dead[], __m256 vx1 = _mm256_min_ps(vx1i, vx1j); __m256 vy1 = _mm256_min_ps(vy1i, vy1j); - __m256 vwidth = _mm256_add_ps(_mm256_sub_ps(vx1, vx0), vc_fone); + __m256 vwidth = _mm256_add_ps(_mm256_sub_ps(vx1, vx0), vc_fone); __m256 vheight = _mm256_add_ps(_mm256_sub_ps(vy1, vy0), vc_fone); __m256 varea = _mm256_mul_ps(_mm256_max_ps(vc_zero, vwidth), _mm256_max_ps(vc_zero, vheight)); - __m256 vB_width = _mm256_sub_ps(vx1j, vx0j); + __m256 vB_width = _mm256_sub_ps(vx1j, vx0j); __m256 vB_height = _mm256_sub_ps(vy1j, vy0j); - __m256 vB_area = _mm256_mul_ps(_mm256_add_ps(vB_width, vc_fone), _mm256_add_ps(vB_height, vc_fone)); + __m256 vB_area = _mm256_mul_ps(_mm256_add_ps(vB_width, vc_fone), _mm256_add_ps(vB_height, vc_fone)); __m256 vdivisor = _mm256_sub_ps(_mm256_add_ps(vA_area, vB_area), varea); __m256 vintersection_area = _mm256_div_ps(varea, vdivisor); @@ -221,9 +234,9 @@ void nms_cpu(const int num_boxes, int is_dead[], const float y1 = std::min(y1i, y1j); // intersection area - const float width = std::max(0.0f, x1 - x0 + coordinates_offset); - const float height = std::max(0.0f, y1 - y0 + coordinates_offset); - const float area = width * height; + const float width = std::max(0.0f, x1 - x0 + coordinates_offset); + const float height = std::max(0.0f, y1 - 
y0 + coordinates_offset); + const float area = width * height; // area of A, B const float A_area = (x1i - x0i + coordinates_offset) * (y1i - y0i + coordinates_offset); @@ -241,16 +254,20 @@ void nms_cpu(const int num_boxes, int is_dead[], *num_out = count; } - -void fill_output_blobs(const float* proposals, const int* roi_indices, - float* rois, float* scores, uint8_t* roi_num, - const int num_proposals, const size_t num_rois, const int post_nms_topn, +void fill_output_blobs(const float* proposals, + const int* roi_indices, + float* rois, + float* scores, + uint8_t* roi_num, + const int num_proposals, + const size_t num_rois, + const int post_nms_topn, ov::element::Type roi_num_type) { - const float *src_x0 = proposals + 0 * num_proposals; - const float *src_y0 = proposals + 1 * num_proposals; - const float *src_x1 = proposals + 2 * num_proposals; - const float *src_y1 = proposals + 3 * num_proposals; - const float *src_score = proposals + 4 * num_proposals; + const float* src_x0 = proposals + 0 * num_proposals; + const float* src_y0 = proposals + 1 * num_proposals; + const float* src_x1 = proposals + 2 * num_proposals; + const float* src_y1 = proposals + 3 * num_proposals; + const float* src_score = proposals + 4 * num_proposals; parallel_for(num_rois, [&](size_t i) { int index = roi_indices[i]; @@ -274,8 +291,8 @@ void fill_output_blobs(const float* proposals, const int* roi_indices, } // namespace -bool GenerateProposals::isSupportedOperation - (const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool GenerateProposals::isSupportedOperation(const std::shared_ptr& op, + std::string& errorMessage) noexcept { try { if (!ov::as_type_ptr(op)) { errorMessage = "Node is not an instance of the Proposal from the operations set v0."; @@ -332,13 +349,13 @@ void GenerateProposals::execute(dnnl::stream strm) { } size_t anchor_dims_size = 1; - const auto &anchorDims = getParentEdgeAt(INPUT_ANCHORS)->getMemory().getStaticDims(); + const auto& anchorDims = 
getParentEdgeAt(INPUT_ANCHORS)->getMemory().getStaticDims(); for (size_t i = 0; i < anchorDims.size(); i++) { anchor_dims_size *= anchorDims[i]; } size_t deltas_dims_size = 1; - const auto &deltaDims = getParentEdgeAt(INPUT_DELTAS)->getMemory().getStaticDims(); + const auto& deltaDims = getParentEdgeAt(INPUT_DELTAS)->getMemory().getStaticDims(); for (size_t i = 1; i < deltaDims.size(); i++) { deltas_dims_size *= deltaDims[i]; } @@ -346,7 +363,7 @@ void GenerateProposals::execute(dnnl::stream strm) { OPENVINO_THROW("'Anchors' blob size for GenerateProposals is incompatible with 'deltas' blob size!"); size_t score_dims_size = 1; - const auto &scoreDims = getParentEdgeAt(INPUT_SCORES)->getMemory().getStaticDims(); + const auto& scoreDims = getParentEdgeAt(INPUT_SCORES)->getMemory().getStaticDims(); for (size_t i = 1; i < scoreDims.size(); i++) { score_dims_size *= scoreDims[i]; } @@ -354,16 +371,16 @@ void GenerateProposals::execute(dnnl::stream strm) { OPENVINO_THROW("'Deltas' blob size for GenerateProposals is incompatible with 'scores' blob size!"); size_t im_info_dims_size = 1; - const auto &infoDims = getParentEdgeAt(INPUT_IM_INFO)->getMemory().getStaticDims(); + const auto& infoDims = getParentEdgeAt(INPUT_IM_INFO)->getMemory().getStaticDims(); for (size_t i = 1; i < infoDims.size(); i++) { im_info_dims_size *= infoDims[i]; } // Prepare memory - const float *p_deltas_item = getSrcDataAtPortAs(INPUT_DELTAS); - const float *p_scores_item = getSrcDataAtPortAs(INPUT_SCORES); - const float *p_anchors_item = getSrcDataAtPortAs(INPUT_ANCHORS); - const float *p_img_info_cpu = getSrcDataAtPortAs(INPUT_IM_INFO); + const float* p_deltas_item = getSrcDataAtPortAs(INPUT_DELTAS); + const float* p_scores_item = getSrcDataAtPortAs(INPUT_SCORES); + const float* p_anchors_item = getSrcDataAtPortAs(INPUT_ANCHORS); + const float* p_img_info_cpu = getSrcDataAtPortAs(INPUT_IM_INFO); const int anchors_num = scoreDims[1]; @@ -422,27 +439,50 @@ void 
GenerateProposals::execute(dnnl::stream strm) { const float min_box_H = min_size_ * scale_h; const float min_box_W = min_size_ * scale_w; - refine_anchors(p_deltas_item, p_scores_item, p_anchors_item, - reinterpret_cast(&proposals_[0]), anchors_num, bottom_H, - bottom_W, img_H, img_W, - min_box_H, min_box_W, + refine_anchors(p_deltas_item, + p_scores_item, + p_anchors_item, + reinterpret_cast(&proposals_[0]), + anchors_num, + bottom_H, + bottom_W, + img_H, + img_W, + min_box_H, + min_box_W, static_cast(std::log(1000. / 16.)), coordinates_offset_); - std::partial_sort(proposals_.begin(), proposals_.begin() + pre_nms_topn, proposals_.end(), - [](const ProposalBox &struct1, const ProposalBox &struct2) { + std::partial_sort(proposals_.begin(), + proposals_.begin() + pre_nms_topn, + proposals_.end(), + [](const ProposalBox& struct1, const ProposalBox& struct2) { return (struct1.score > struct2.score); }); - unpack_boxes(reinterpret_cast(&proposals_[0]), &unpacked_boxes[0], &is_dead[0], pre_nms_topn); - nms_cpu(pre_nms_topn, &is_dead[0], &unpacked_boxes[0], &roi_indices_[0], &num_rois, 0, - nms_thresh_, post_nms_topn_, coordinates_offset_); + unpack_boxes(reinterpret_cast(&proposals_[0]), &unpacked_boxes[0], &is_dead[0], pre_nms_topn); + nms_cpu(pre_nms_topn, + &is_dead[0], + &unpacked_boxes[0], + &roi_indices_[0], + &num_rois, + 0, + nms_thresh_, + post_nms_topn_, + coordinates_offset_); size_t new_num_rois = total_num_rois + num_rois; roi_item.resize(new_num_rois * 4); score_item.resize(new_num_rois); - fill_output_blobs(&unpacked_boxes[0], &roi_indices_[0], &roi_item[total_num_rois * 4], &score_item[total_num_rois], - p_roi_num, pre_nms_topn, num_rois, post_nms_topn_, roi_num_type); + fill_output_blobs(&unpacked_boxes[0], + &roi_indices_[0], + &roi_item[total_num_rois * 4], + &score_item[total_num_rois], + p_roi_num, + pre_nms_topn, + num_rois, + post_nms_topn_, + roi_num_type); p_deltas_item += deltas_dims_size; p_scores_item += score_dims_size; p_img_info_cpu += 
im_info_dims_size; @@ -451,13 +491,13 @@ void GenerateProposals::execute(dnnl::stream strm) { } // copy to out memory redefineOutputMemory({VectorDims{total_num_rois, 4}, VectorDims{total_num_rois}, VectorDims{batch_size}}); - float *p_roi_item = getDstDataAtPortAs(OUTPUT_ROIS); - float *p_roi_score_item = getDstDataAtPortAs(OUTPUT_SCORES); + float* p_roi_item = getDstDataAtPortAs(OUTPUT_ROIS); + float* p_roi_score_item = getDstDataAtPortAs(OUTPUT_SCORES); uint8_t* p_roi_num_item = getDstDataAtPortAs(OUTPUT_ROI_NUM); memcpy(p_roi_item, &roi_item[0], roi_item.size() * sizeof(float)); memcpy(p_roi_score_item, &score_item[0], score_item.size() * sizeof(float)); memcpy(p_roi_num_item, &roi_num[0], getDstMemoryAtPort(OUTPUT_ROI_NUM)->getSize()); - } catch (const std::exception &e) { + } catch (const std::exception& e) { std::string errorMsg = e.what(); OPENVINO_THROW(errorMsg); } @@ -475,6 +515,6 @@ bool GenerateProposals::needPrepareParams() const { return false; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/generate_proposals.h b/src/plugins/intel_cpu/src/nodes/generate_proposals.h index 5438f30011d986..666338eed3d4aa 100644 --- a/src/plugins/intel_cpu/src/nodes/generate_proposals.h +++ b/src/plugins/intel_cpu/src/nodes/generate_proposals.h @@ -14,7 +14,7 @@ class GenerateProposals : public Node { public: GenerateProposals(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; @@ -35,13 +35,13 @@ class GenerateProposals : public Node { // scores, shape [rois_num] // roi_num, shape [N] - const int INPUT_IM_INFO {0}; - const int INPUT_ANCHORS {1}; - const int INPUT_DELTAS {2}; - const int INPUT_SCORES {3}; - const int 
OUTPUT_ROIS {0}; - const int OUTPUT_SCORES {1}; - const int OUTPUT_ROI_NUM {2}; + const int INPUT_IM_INFO{0}; + const int INPUT_ANCHORS{1}; + const int INPUT_DELTAS{2}; + const int INPUT_SCORES{3}; + const int OUTPUT_ROIS{0}; + const int OUTPUT_SCORES{1}; + const int OUTPUT_ROI_NUM{2}; float min_size_ = 0.f; int pre_nms_topn_ = 0; @@ -52,6 +52,6 @@ class GenerateProposals : public Node { std::vector roi_indices_; }; -} // namespace node +} // namespace node } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/grid_sample.cpp b/src/plugins/intel_cpu/src/nodes/grid_sample.cpp index c8b73360539b68..9f346a2db14dac 100644 --- a/src/plugins/intel_cpu/src/nodes/grid_sample.cpp +++ b/src/plugins/intel_cpu/src/nodes/grid_sample.cpp @@ -3,15 +3,16 @@ // #include "grid_sample.hpp" -#include "openvino/op/grid_sample.hpp" + #include "openvino/core/parallel.hpp" +#include "openvino/op/grid_sample.hpp" using namespace ov::intel_cpu; using namespace ov::intel_cpu::node; #if defined(OPENVINO_ARCH_X86_64) using namespace dnnl::impl::cpu; -#endif // OPENVINO_ARCH_X86_64 +#endif // OPENVINO_ARCH_X86_64 #define THROW_ERROR(...) OPENVINO_THROW(getTypeStr(), " node with name '", getName(), "' ", __VA_ARGS__) @@ -28,7 +29,7 @@ bool GridSample::isSupportedOperation(const std::shared_ptr& op, } #else return false; -#endif // OPENVINO_ARCH_X86_64 +#endif // OPENVINO_ARCH_X86_64 } catch (...) 
{ return false; } @@ -61,30 +62,30 @@ GridSample::GridSample(const std::shared_ptr& op, const GraphContext:: const auto& attributes = ov::as_type_ptr(op)->get_attributes(); alignCorners = attributes.align_corners; switch (attributes.mode) { - case op::v9::GridSample::InterpolationMode::BILINEAR: - interpolationMode = GridSampleInterpolationMode::BILINEAR; - break; - case op::v9::GridSample::InterpolationMode::BICUBIC: - interpolationMode = GridSampleInterpolationMode::BICUBIC; - break; - case op::v9::GridSample::InterpolationMode::NEAREST: - interpolationMode = GridSampleInterpolationMode::NEAREST; - break; - default: - THROW_CPU_NODE_ERR("supports only BILINEAR, BICUBIC, NEAREST interpolation modes."); + case op::v9::GridSample::InterpolationMode::BILINEAR: + interpolationMode = GridSampleInterpolationMode::BILINEAR; + break; + case op::v9::GridSample::InterpolationMode::BICUBIC: + interpolationMode = GridSampleInterpolationMode::BICUBIC; + break; + case op::v9::GridSample::InterpolationMode::NEAREST: + interpolationMode = GridSampleInterpolationMode::NEAREST; + break; + default: + THROW_CPU_NODE_ERR("supports only BILINEAR, BICUBIC, NEAREST interpolation modes."); } switch (attributes.padding_mode) { - case op::v9::GridSample::PaddingMode::ZEROS: - paddingMode = GridSamplePaddingMode::ZEROS; - break; - case op::v9::GridSample::PaddingMode::BORDER: - paddingMode = GridSamplePaddingMode::BORDER; - break; - case op::v9::GridSample::PaddingMode::REFLECTION: - paddingMode = GridSamplePaddingMode::REFLECTION; - break; - default: - THROW_CPU_NODE_ERR("supports only BORDER, REFLECTION, ZEROS paddings modes."); + case op::v9::GridSample::PaddingMode::ZEROS: + paddingMode = GridSamplePaddingMode::ZEROS; + break; + case op::v9::GridSample::PaddingMode::BORDER: + paddingMode = GridSamplePaddingMode::BORDER; + break; + case op::v9::GridSample::PaddingMode::REFLECTION: + paddingMode = GridSamplePaddingMode::REFLECTION; + break; + default: + THROW_CPU_NODE_ERR("supports only 
BORDER, REFLECTION, ZEROS paddings modes."); } } @@ -107,8 +108,7 @@ void GridSample::initSupportedPrimitiveDescriptors() { } // 95905 - to add nspc layout support. - addSupportedPrimDesc({{LayoutType::ncsp, dataPrecision}, - {LayoutType::ncsp, gridPrecision}}, + addSupportedPrimDesc({{LayoutType::ncsp, dataPrecision}, {LayoutType::ncsp, gridPrecision}}, {{LayoutType::ncsp, dataPrecision}}, implType); } @@ -116,25 +116,26 @@ void GridSample::initSupportedPrimitiveDescriptors() { void GridSample::createPrimitive() { kernel::GridSampleKernelConfParams jcp; - jcp.inDataPrc = dataPrecision; - jcp.gridPrc = gridPrecision; + jcp.inDataPrc = dataPrecision; + jcp.gridPrc = gridPrecision; jcp.dynamicShapes = isDynamicNode(); - jcp.alignCorners = alignCorners; + jcp.alignCorners = alignCorners; jcp.interpolationMode = interpolationMode; - jcp.paddingMode = paddingMode; + jcp.paddingMode = paddingMode; const auto& srcDataDims = getInputShapeAtPort(IN_DATA).getDims(); if (!jcp.dynamicShapes) { - jcp.batchNum = srcDataDims[0]; - jcp.cannelNum = srcDataDims[1]; - jcp.dynamicBatch = false; + jcp.batchNum = srcDataDims[0]; + jcp.cannelNum = srcDataDims[1]; + jcp.dynamicBatch = false; jcp.dynamicChannel = false; - jcp.srcBatchStepB = std::accumulate(srcDataDims.begin() + 1, srcDataDims.end(), dataTypeSize, std::multiplies()); + jcp.srcBatchStepB = + std::accumulate(srcDataDims.begin() + 1, srcDataDims.end(), dataTypeSize, std::multiplies()); } else { - jcp.dynamicBatch = srcDataDims[0] == Shape::UNDEFINED_DIM; - jcp.batchNum = jcp.dynamicBatch ? 1lu : srcDataDims[0]; + jcp.dynamicBatch = srcDataDims[0] == Shape::UNDEFINED_DIM; + jcp.batchNum = jcp.dynamicBatch ? 1lu : srcDataDims[0]; jcp.dynamicChannel = srcDataDims[1] == Shape::UNDEFINED_DIM; - jcp.cannelNum = jcp.dynamicChannel ? 1lu : srcDataDims[1]; + jcp.cannelNum = jcp.dynamicChannel ? 
1lu : srcDataDims[1]; } if (x64::mayiuse(x64::avx512_core)) { @@ -195,7 +196,7 @@ void GridSample::prepareParams() { const uint64_t dataElPerVec = jitKernel->getDataElPerVec(); const auto& srcDataShape = dataMemPtr->getStaticDims(); - const auto& dstShape = dstMemPtr->getStaticDims(); + const auto& dstShape = dstMemPtr->getStaticDims(); const uint64_t totalWork = dstShape[2] * dstShape[3]; const uint64_t wpt = ((totalWork / dataElPerVec) / m_threads_num + 1) * dataElPerVec; @@ -210,26 +211,27 @@ void GridSample::prepareParams() { return; } - p.batchNum = srcDataShape[0]; - p.channelsNum = srcDataShape[1]; + p.batchNum = srcDataShape[0]; + p.channelsNum = srcDataShape[1]; p.srcHeightF[0] = srcDataShape[2]; - p.srcWidthF[0] = srcDataShape[3]; + p.srcWidthF[0] = srcDataShape[3]; p.gridStartB = dstStart * 2 * gridTypeSize; - p.dstStartB = dstStart * dataTypeSize; + p.dstStartB = dstStart * dataTypeSize; - p.srcBatchStepB = std::accumulate(srcDataShape.begin() + 1, srcDataShape.end(), dataTypeSize, std::multiplies()); + p.srcBatchStepB = + std::accumulate(srcDataShape.begin() + 1, srcDataShape.end(), dataTypeSize, std::multiplies()); p.gridBatchStepB = (dstShape[2] * dstShape[3] - p.workAmount) * 2 * gridTypeSize; - p.dstBatchStepB = (dstShape[1] * dstShape[2] * dstShape[3] - p.workAmount) * dataTypeSize; + p.dstBatchStepB = (dstShape[1] * dstShape[2] * dstShape[3] - p.workAmount) * dataTypeSize; p.srcChannelStepB = srcDataShape[2] * srcDataShape[3] * dataTypeSize; p.dstChannelStepB = dstShape[2] * dstShape[3] * dataTypeSize; p.dataTypeSize[0] = dataTypeSize; p.srcHeightSub1F[0] = p.srcHeightF[0] - 1.f; - p.srcWidthSub1F[0] = p.srcWidthF[0] - 1.f; + p.srcWidthSub1F[0] = p.srcWidthF[0] - 1.f; p.srcHeightMul2F[0] = p.srcHeightF[0] * 2.f; - p.srcWidthMul2F[0] = p.srcWidthF[0] * 2.f; + p.srcWidthMul2F[0] = p.srcWidthF[0] * 2.f; if (interpolationMode == GridSampleInterpolationMode::BICUBIC && srcDataShape[3] >= 4) { p.srcWidthB[0] = (srcDataShape[3] - 3) * dataTypeSize; } 
else { @@ -237,24 +239,24 @@ void GridSample::prepareParams() { } if (alignCorners) { p.srcHeightMul2Sub1F[0] = p.srcHeightF[0] == 1.f ? 1.f : p.srcHeightSub1F[0] * 2.f; - p.srcWidthMul2Sub1F[0] = p.srcWidthF[0] == 1.f ? 1.f : p.srcWidthSub1F[0] * 2.f; - p.wDenormCoefF[0] = (p.srcWidthF[0] - 1.f) / 2.f; + p.srcWidthMul2Sub1F[0] = p.srcWidthF[0] == 1.f ? 1.f : p.srcWidthSub1F[0] * 2.f; + p.wDenormCoefF[0] = (p.srcWidthF[0] - 1.f) / 2.f; p.hDenormCoefF[0] = (p.srcHeightF[0] - 1.f) / 2.f; } else { p.srcHeightMul2Sub1F[0] = p.srcHeightMul2F[0] - 1.f; - p.srcWidthMul2Sub1F[0] = p.srcWidthMul2F[0] - 1.f; + p.srcWidthMul2Sub1F[0] = p.srcWidthMul2F[0] - 1.f; } if (!x64::mayiuse(x64::avx512_core)) { - std::fill(p.srcHeightF.begin(), p.srcHeightF.end(), p.srcHeightF[0]); - std::fill(p.srcWidthF.begin(), p.srcWidthF.end(), p.srcWidthF[0]); - std::fill(p.dataTypeSize.begin(), p.dataTypeSize.end(), p.dataTypeSize[0]); - std::fill(p.srcHeightSub1F.begin(), p.srcHeightSub1F.end(), p.srcHeightSub1F[0]); - std::fill(p.srcWidthSub1F.begin(), p.srcWidthSub1F.end(), p.srcWidthSub1F[0]); - std::fill(p.srcHeightMul2F.begin(), p.srcHeightMul2F.end(), p.srcHeightMul2F[0]); - std::fill(p.srcWidthMul2F.begin(), p.srcWidthMul2F.end(), p.srcWidthMul2F[0]); - std::fill(p.srcWidthB.begin(), p.srcWidthB.end(), p.srcWidthB[0]); + std::fill(p.srcHeightF.begin(), p.srcHeightF.end(), p.srcHeightF[0]); + std::fill(p.srcWidthF.begin(), p.srcWidthF.end(), p.srcWidthF[0]); + std::fill(p.dataTypeSize.begin(), p.dataTypeSize.end(), p.dataTypeSize[0]); + std::fill(p.srcHeightSub1F.begin(), p.srcHeightSub1F.end(), p.srcHeightSub1F[0]); + std::fill(p.srcWidthSub1F.begin(), p.srcWidthSub1F.end(), p.srcWidthSub1F[0]); + std::fill(p.srcHeightMul2F.begin(), p.srcHeightMul2F.end(), p.srcHeightMul2F[0]); + std::fill(p.srcWidthMul2F.begin(), p.srcWidthMul2F.end(), p.srcWidthMul2F[0]); + std::fill(p.srcWidthB.begin(), p.srcWidthB.end(), p.srcWidthB[0]); std::fill(p.srcHeightMul2Sub1F.begin(), 
p.srcHeightMul2Sub1F.end(), p.srcHeightMul2Sub1F[0]); - std::fill(p.srcWidthMul2Sub1F.begin(), p.srcWidthMul2Sub1F.end(), p.srcWidthMul2Sub1F[0]); + std::fill(p.srcWidthMul2Sub1F.begin(), p.srcWidthMul2Sub1F.end(), p.srcWidthMul2Sub1F[0]); if (alignCorners) { std::fill(p.wDenormCoefF.begin(), p.wDenormCoefF.end(), p.wDenormCoefF[0]); std::fill(p.hDenormCoefF.begin(), p.hDenormCoefF.end(), p.hDenormCoefF[0]); @@ -264,9 +266,9 @@ void GridSample::prepareParams() { } void GridSample::execute(dnnl::stream strm) { - const void* srcData = getSrcDataAtPort(IN_DATA); + const void* srcData = getSrcDataAtPort(IN_DATA); const uint8_t* gridData = getSrcDataAtPortAs(IN_GRID); - uint8_t* dstData = getDstDataAtPortAs(0); + uint8_t* dstData = getDstDataAtPortAs(0); auto threadBody = [&](const int ithr, const int nthr) { const auto& p = execParamsPerThread[ithr]; @@ -275,30 +277,30 @@ void GridSample::execute(dnnl::stream strm) { return; } - arg.src = srcData; - arg.grid = gridData + p.gridStartB; - arg.dst = dstData + p.dstStartB; - arg.batchNum = p.batchNum; - arg.channelsNum = p.channelsNum; - arg.srcHeightF = p.srcHeightF.data(); - arg.srcWidthF = p.srcWidthF.data(); - arg.srcWidthB = p.srcWidthB.data(); - arg.srcChannelStepB = p.srcChannelStepB; - arg.dstChannelStepB = p.dstChannelStepB; - arg.srcBatchStepB = p.srcBatchStepB; - arg.gridBatchStepB = p.gridBatchStepB; - arg.dstBatchStepB = p.dstBatchStepB; - arg.srcHeightSub1F = p.srcHeightSub1F.data(); - arg.srcWidthSub1F = p.srcWidthSub1F.data(); - arg.srcWidthMul2F = p.srcWidthMul2F.data(); - arg.srcHeightMul2F = p.srcHeightMul2F.data(); + arg.src = srcData; + arg.grid = gridData + p.gridStartB; + arg.dst = dstData + p.dstStartB; + arg.batchNum = p.batchNum; + arg.channelsNum = p.channelsNum; + arg.srcHeightF = p.srcHeightF.data(); + arg.srcWidthF = p.srcWidthF.data(); + arg.srcWidthB = p.srcWidthB.data(); + arg.srcChannelStepB = p.srcChannelStepB; + arg.dstChannelStepB = p.dstChannelStepB; + arg.srcBatchStepB = 
p.srcBatchStepB; + arg.gridBatchStepB = p.gridBatchStepB; + arg.dstBatchStepB = p.dstBatchStepB; + arg.srcHeightSub1F = p.srcHeightSub1F.data(); + arg.srcWidthSub1F = p.srcWidthSub1F.data(); + arg.srcWidthMul2F = p.srcWidthMul2F.data(); + arg.srcHeightMul2F = p.srcHeightMul2F.data(); arg.srcHeightMul2Sub1F = p.srcHeightMul2Sub1F.data(); - arg.srcWidthMul2Sub1F = p.srcWidthMul2Sub1F.data(); - arg.wDenormCoefF = p.wDenormCoefF.data(); - arg.hDenormCoefF = p.hDenormCoefF.data(); - arg.dataTypeSize = p.dataTypeSize.data(); - arg.buffer = p.buffer.data(); - arg.workAmount = p.workAmount; + arg.srcWidthMul2Sub1F = p.srcWidthMul2Sub1F.data(); + arg.wDenormCoefF = p.wDenormCoefF.data(); + arg.hDenormCoefF = p.hDenormCoefF.data(); + arg.dataTypeSize = p.dataTypeSize.data(); + arg.buffer = p.buffer.data(); + arg.workAmount = p.workAmount; (*jitKernel)(&arg); }; @@ -314,4 +316,4 @@ bool GridSample::created() const { return getType() == Type::GridSample; } -#endif // OPENVINO_ARCH_X86_64 +#endif // OPENVINO_ARCH_X86_64 diff --git a/src/plugins/intel_cpu/src/nodes/grid_sample.hpp b/src/plugins/intel_cpu/src/nodes/grid_sample.hpp index b4468d58be9b52..eb4fd38b64c878 100644 --- a/src/plugins/intel_cpu/src/nodes/grid_sample.hpp +++ b/src/plugins/intel_cpu/src/nodes/grid_sample.hpp @@ -5,6 +5,7 @@ #pragma once #include + #include "kernels/x64/grid_sample.hpp" namespace ov { @@ -16,35 +17,35 @@ class GridSample : public Node { GridSample(const std::shared_ptr& op, const GraphContext::CPtr context); static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void createPrimitive() override; void execute(dnnl::stream strm) override; bool created() const override; struct threadExecParams { - uint64_t batchNum = 1lu; + uint64_t batchNum = 1lu; uint64_t channelsNum = 1lu; - std::vector srcHeightF{ 1.f }; - 
std::vector srcWidthF{ 1.f }; - std::vector srcWidthB{ 1lu }; - std::vector dataTypeSize{ 1lu }; - std::vector srcHeightMul2F{ 1.f }; - std::vector srcWidthMul2F{ 1.f }; - std::vector srcHeightMul2Sub1F{ 1.f }; - std::vector srcWidthMul2Sub1F{ 1.f }; - std::vector srcHeightSub1F{ 1.f }; - std::vector srcWidthSub1F{ 1.f }; - std::vector wDenormCoefF{ 1.f }; - std::vector hDenormCoefF{ 1.f }; - uint64_t gridStartB = 0lu; - uint64_t dstStartB = 0lu; + std::vector srcHeightF{1.f}; + std::vector srcWidthF{1.f}; + std::vector srcWidthB{1lu}; + std::vector dataTypeSize{1lu}; + std::vector srcHeightMul2F{1.f}; + std::vector srcWidthMul2F{1.f}; + std::vector srcHeightMul2Sub1F{1.f}; + std::vector srcWidthMul2Sub1F{1.f}; + std::vector srcHeightSub1F{1.f}; + std::vector srcWidthSub1F{1.f}; + std::vector wDenormCoefF{1.f}; + std::vector hDenormCoefF{1.f}; + uint64_t gridStartB = 0lu; + uint64_t dstStartB = 0lu; uint64_t srcChannelStepB = 0lu; uint64_t dstChannelStepB = 0lu; - uint64_t srcBatchStepB = 0lu; - uint64_t gridBatchStepB = 0lu; - uint64_t dstBatchStepB = 0lu; - uint64_t workAmount = 0lu; + uint64_t srcBatchStepB = 0lu; + uint64_t gridBatchStepB = 0lu; + uint64_t dstBatchStepB = 0lu; + uint64_t workAmount = 0lu; std::vector buffer; }; @@ -71,6 +72,6 @@ class GridSample : public Node { std::shared_ptr jitKernel; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/grn.cpp b/src/plugins/intel_cpu/src/nodes/grn.cpp index 10de2ef2286f0f..374452812eaf3a 100644 --- a/src/plugins/intel_cpu/src/nodes/grn.cpp +++ b/src/plugins/intel_cpu/src/nodes/grn.cpp @@ -2,11 +2,12 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "grn.h" + #include -#include "openvino/opsets/opset1.hpp" #include "openvino/core/parallel.hpp" -#include "grn.h" +#include "openvino/opsets/opset1.hpp" namespace ov { namespace intel_cpu { @@ -97,11 +98,12 @@ void 
GRN::execute(dnnl::stream strm) { parallel_for3d(N, H, W, [&](int b, int h, int w) { double variance = 0; for (int c = 0; c < C; c++) { - variance += std::pow(src_data[b*C*H*W + c*H*W + h*W + w], 2); + variance += std::pow(src_data[b * C * H * W + c * H * W + h * W + w], 2); } variance = std::pow(variance + bias, 0.5f); for (int c = 0; c < C; c++) { - dst_data[b*C*H*W + c*H*W + h*W + w] = src_data[b*C*H*W + c*H*W + h*W + w] / static_cast(variance); + dst_data[b * C * H * W + c * H * W + h * W + w] = + src_data[b * C * H * W + c * H * W + h * W + w] / static_cast(variance); } }); } @@ -110,6 +112,6 @@ bool GRN::created() const { return getType() == Type::GRN; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/grn.h b/src/plugins/intel_cpu/src/nodes/grn.h index 52e77318e2132f..17eac4e81b9d6c 100644 --- a/src/plugins/intel_cpu/src/nodes/grn.h +++ b/src/plugins/intel_cpu/src/nodes/grn.h @@ -14,7 +14,7 @@ class GRN : public Node { public: GRN(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; @@ -34,6 +34,6 @@ class GRN : public Node { std::string errorPrefix; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/if.cpp b/src/plugins/intel_cpu/src/nodes/if.cpp index 1b6102ff954689..8de1cf14920d74 100644 --- a/src/plugins/intel_cpu/src/nodes/if.cpp +++ b/src/plugins/intel_cpu/src/nodes/if.cpp @@ -4,22 +4,22 @@ #include "if.h" -#include "openvino/op/if.hpp" +#include +#include #include "common/cpu_memcpy.h" -#include "shape_inference/shape_inference_internal_dyn.hpp" #include 
"nodes/common/cpu_convert.h" +#include "openvino/op/if.hpp" +#include "shape_inference/shape_inference_internal_dyn.hpp" #include "transformations/utils/utils.hpp" -#include -#include - namespace ov { namespace intel_cpu { namespace node { -If::PortMapHelper::PortMapHelper(const MemoryPtr &from, const std::deque& to, - const dnnl::engine& eng) : srcMemPtr(from), dstMemPtrs(to) { +If::PortMapHelper::PortMapHelper(const MemoryPtr& from, const std::deque& to, const dnnl::engine& eng) + : srcMemPtr(from), + dstMemPtrs(to) { size = 0; if (srcMemPtr->getDesc().isDefined()) size = srcMemPtr->getShape().getElementsCount(); @@ -43,7 +43,7 @@ void If::PortMapHelper::execute(dnnl::stream& strm) { } void If::PortMapHelper::redefineTo() { - const auto &currDesc = dstMemPtrs.front()->getDesc(); + const auto& currDesc = dstMemPtrs.front()->getDesc(); if (currDesc.getShape().isDynamic() || currDesc.getShape().getStaticDims() != srcMemPtr->getStaticDims()) { // TODO : check the entire dstMemPtrs usage considering the proper memory sharing auto newShape = srcMemPtr->getStaticDims(); @@ -60,7 +60,7 @@ bool If::isSupportedOperation(const std::shared_ptr& op, std::st try { if (!one_of(op->get_type_info(), ov::op::v8::If::get_type_info_static())) { errorMessage = "Not supported If operation version " + std::string(op->get_type_info().version_id) + - " with name '" + op->get_friendly_name() + "'. Node If supports only opset8 version."; + " with name '" + op->get_friendly_name() + "'. Node If supports only opset8 version."; return false; } } catch (...) 
{ @@ -69,8 +69,9 @@ bool If::isSupportedOperation(const std::shared_ptr& op, std::st return true; } -If::If(const std::shared_ptr& op, const GraphContext::CPtr context) : - Node(op, context, InternalDynShapeInferFactory()), ovOp(op) { +If::If(const std::shared_ptr& op, const GraphContext::CPtr context) + : Node(op, context, InternalDynShapeInferFactory()), + ovOp(op) { std::string errorMessage; if (!isSupportedOperation(op, errorMessage)) { OPENVINO_THROW_NOT_IMPLEMENTED(errorMessage); @@ -111,49 +112,55 @@ void If::getSupportedDescriptors() { } } - const auto &outMapThen = subGraphThen.GetOutputNodesMap(); + const auto& outMapThen = subGraphThen.GetOutputNodesMap(); for (const auto& out : ifOp->get_then_body()->get_results()) { auto outNode = outMapThen.find(ifOp->get_then_body()->get_result_index(out)); if (outNode != outMapThen.end()) { auto outMem = outNode->second->getSrcMemoryAtPort(0); outputMemThen.push_back(outMem); } else { - OPENVINO_THROW("Then body of node If with name ", getName(), " does not have output with name: ", out->get_friendly_name()); + OPENVINO_THROW("Then body of node If with name ", + getName(), + " does not have output with name: ", + out->get_friendly_name()); } } - const auto &outMapElse = subGraphElse.GetOutputNodesMap(); + const auto& outMapElse = subGraphElse.GetOutputNodesMap(); for (const auto& out : ifOp->get_else_body()->get_results()) { auto outNode = outMapElse.find(ifOp->get_else_body()->get_result_index(out)); if (outNode != outMapElse.end()) { auto outMem = outNode->second->getSrcMemoryAtPort(0); outputMemElse.push_back(outMem); } else { - OPENVINO_THROW("Else body of node If with name ", getName(), " does not have output with name: ", out->get_friendly_name()); + OPENVINO_THROW("Else body of node If with name ", + getName(), + " does not have output with name: ", + out->get_friendly_name()); } } // Port map: outputs for (const auto& desc : ifOp->get_output_descriptions(0)) { auto body_output_idx = desc->m_body_value_index; 
- thenOutputPortMap.emplace_back(PortMap { - static_cast(desc->m_output_index), static_cast(body_output_idx)}); + thenOutputPortMap.emplace_back( + PortMap{static_cast(desc->m_output_index), static_cast(body_output_idx)}); } for (const auto& desc : ifOp->get_output_descriptions(1)) { auto body_output_idx = desc->m_body_value_index; - elseOutputPortMap.emplace_back(PortMap { - static_cast(desc->m_output_index), static_cast(body_output_idx)}); + elseOutputPortMap.emplace_back( + PortMap{static_cast(desc->m_output_index), static_cast(body_output_idx)}); } for (const auto& desc : ifOp->get_input_descriptions(0)) { auto body_input_index = desc->m_body_parameter_index; - thenInputPortMap.emplace_back(PortMap { - static_cast(desc->m_input_index), static_cast(body_input_index)}); + thenInputPortMap.emplace_back( + PortMap{static_cast(desc->m_input_index), static_cast(body_input_index)}); } for (const auto& desc : ifOp->get_input_descriptions(1)) { auto body_input_index = desc->m_body_parameter_index; - elseInputPortMap.emplace_back(PortMap { - static_cast(desc->m_input_index), static_cast(body_input_index)}); + elseInputPortMap.emplace_back( + PortMap{static_cast(desc->m_input_index), static_cast(body_input_index)}); } } @@ -166,16 +173,17 @@ void If::initSupportedPrimitiveDescriptors() { config.outConfs.reserve(getChildEdges().size()); for (size_t i = 0; i < inputShapes.size(); i++) { - PortConfig dataConf {}; + PortConfig dataConf{}; auto descCreator = BlockedDescCreator::getCommonCreators().at(LayoutType::ncsp); dataConf.setMemDesc(descCreator->createSharedDesc(getOriginalInputPrecisionAtPort(i), getInputShapeAtPort(i))); config.inConfs.emplace_back(dataConf); } for (size_t i = 0; i < outputShapes.size(); i++) { - PortConfig dataConf {}; + PortConfig dataConf{}; auto descCreator = BlockedDescCreator::getCommonCreators().at(LayoutType::ncsp); - dataConf.setMemDesc(descCreator->createSharedDesc(getOriginalOutputPrecisionAtPort(i), getOutputShapeAtPort(i))); + 
dataConf.setMemDesc( + descCreator->createSharedDesc(getOriginalOutputPrecisionAtPort(i), getOutputShapeAtPort(i))); config.outConfs.push_back(dataConf); } @@ -195,9 +203,9 @@ void If::createPrimitive() { } void If::prepareBeforeMappers(const bool isThen, const dnnl::engine& eng) { - auto &inputPortMap = isThen ? thenInputPortMap : elseInputPortMap; - auto &inputMems = isThen ? inputMemThen : inputMemElse; - auto &beforeMappers = isThen ? beforeThenMappers : beforeElseMappers; + auto& inputPortMap = isThen ? thenInputPortMap : elseInputPortMap; + auto& inputMems = isThen ? inputMemThen : inputMemElse; + auto& beforeMappers = isThen ? beforeThenMappers : beforeElseMappers; for (auto& map_rule : inputPortMap) { auto fromMem = getSrcMemoryAtPort(map_rule.from); auto& toMems = inputMems[map_rule.to]; @@ -216,12 +224,12 @@ void If::prepareBeforeMappers(const bool isThen, const dnnl::engine& eng) { } void If::prepareAfterMappers(const bool isThen, const dnnl::engine& eng) { - auto &outputPortMap = isThen ? thenOutputPortMap : elseOutputPortMap; - auto &outputMems = isThen ? outputMemThen : outputMemElse; - auto &afterMappers = isThen ? afterThenMappers : afterElseMappers; + auto& outputPortMap = isThen ? thenOutputPortMap : elseOutputPortMap; + auto& outputMems = isThen ? outputMemThen : outputMemElse; + auto& afterMappers = isThen ? afterThenMappers : afterElseMappers; for (auto& map_rule : outputPortMap) { auto toMems = getToMemories(this, map_rule.from); - auto &fromMem = outputMems[map_rule.to]; + auto& fromMem = outputMems[map_rule.to]; // Check precision between If node input/output and it's subgrapsh input/output. for (const auto& toMem : toMems) { if (fromMem->getDesc().getPrecision() != toMem->getDesc().getPrecision()) { @@ -250,11 +258,11 @@ void If::execute(dnnl::stream strm) { auto& afterMappers = condition ? afterThenMappers : afterElseMappers; auto& subGraph = condition ? 
subGraphThen : subGraphElse; - for (auto &mapper : beforeMappers) + for (auto& mapper : beforeMappers) mapper->execute(strm); subGraph.ResetInferCount(); subGraph.Infer(); - for (auto &mapper : afterMappers) + for (auto& mapper : afterMappers) mapper->execute(strm); } @@ -266,6 +274,6 @@ bool If::created() const { return getType() == Type::If; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/if.h b/src/plugins/intel_cpu/src/nodes/if.h index f858c92b0b2651..a2babb45b6c803 100644 --- a/src/plugins/intel_cpu/src/nodes/if.h +++ b/src/plugins/intel_cpu/src/nodes/if.h @@ -4,8 +4,8 @@ #pragma once -#include #include +#include #include #include @@ -25,12 +25,18 @@ class If : public Node { void createPrimitive() override; bool created() const override; void execute(dnnl::stream strm) override; - bool isExecutable() const override { return true; } + bool isExecutable() const override { + return true; + } protected: void executeDynamicImpl(dnnl::stream strm) override; - bool needPrepareParams() const override { return false; }; - bool needShapeInfer() const override { return false; } + bool needPrepareParams() const override { + return false; + }; + bool needShapeInfer() const override { + return false; + } private: void prepareBeforeMappers(const bool isThen, const dnnl::engine& eng); @@ -64,21 +70,14 @@ class If : public Node { std::vector> inputMemThen, inputMemElse; std::deque outputMemThen, outputMemElse; - std::vector> - beforeThenMappers, - beforeElseMappers, - afterThenMappers, + std::vector> beforeThenMappers, beforeElseMappers, afterThenMappers, afterElseMappers; - std::vector - thenInputPortMap, - thenOutputPortMap, - elseInputPortMap, - elseOutputPortMap; + std::vector thenInputPortMap, thenOutputPortMap, elseInputPortMap, elseOutputPortMap; const std::shared_ptr ovOp; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov 
+} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/input.cpp b/src/plugins/intel_cpu/src/nodes/input.cpp index 1f650bd8c5de17..4bb2f714b284fd 100644 --- a/src/plugins/intel_cpu/src/nodes/input.cpp +++ b/src/plugins/intel_cpu/src/nodes/input.cpp @@ -5,8 +5,11 @@ #include "input.h" #include "cpu/x64/jit_generator.hpp" +#include "memory_desc/cpu_memory_desc_utils.h" #include "nodes/node_config.h" #include "openvino/core/parallel.hpp" +#include "openvino/core/shape.hpp" +#include "openvino/core/type/element_type.hpp" #include "shape_inference/shape_inference_pass_through.hpp" using namespace dnnl; @@ -35,16 +38,14 @@ struct jit_has_subnormals_base : public jit_generator { } fn_t get() { - return jit_ker() || create_kernel() == dnnl::impl::status::success - ? (fn_t)jit_ker() - : nullptr; + return jit_ker() || create_kernel() == dnnl::impl::status::success ? (fn_t)jit_ker() : nullptr; } protected: - void foreach(const Xbyak::Reg64& idx, - size_t step, - const Xbyak::Reg64& end, - std::function && fn) { + void foreach (const Xbyak::Reg64& idx, + size_t step, + const Xbyak::Reg64& end, + std::function && fn) { Label loop, exit; L(loop); @@ -58,75 +59,76 @@ struct jit_has_subnormals_base : public jit_generator { L(exit); } - void copy_floats(const Xbyak::Reg64& dst, - const Xbyak::Reg64& src, - const Xbyak::Reg64& size) { + void copy_floats(const Xbyak::Reg64& dst, const Xbyak::Reg64& src, const Xbyak::Reg64& size) { push(rsi); push(r15); xor_(rsi, rsi); - foreach(rsi, 1, size, [&, this](const Xbyak::Reg64& idx) { + foreach (rsi, 1, size, [&, this](const Xbyak::Reg64& idx) { mov(r15d, dword[src + idx * sizeof(float)]); mov(dword[dst + idx * sizeof(float)], r15d); - }); + }) + ; pop(r15); pop(rsi); } - void check_subnormals(const Xbyak::Reg64& src, const Xbyak::Ymm &exponent_mask, const Xbyak::Ymm &mantissa_mask, const Xbyak::Ymm &zero) { + void check_subnormals(const Xbyak::Reg64& src, + const Xbyak::Ymm& 
exponent_mask, + const Xbyak::Ymm& mantissa_mask, + const Xbyak::Ymm& zero) { auto a = ymm1; auto b = ymm2; auto c = ymm3; - vmovdqu(a, yword[src]); // load 8 floats - vpand(b, a, mantissa_mask); // b = a & 00000000011111111111111111111111 - vpcmpeqd(b, b, zero); // if (b == 0) b = 1 else b = 0 - vpand(c, a, exponent_mask); // c = a & 01111111100000000000000000000000 - vpcmpeqd(c, c, zero); // if (c == 0) c = 1 else c = 0 - vptest(b, c); // if ((!b & c) == 0) CF = 1 else CF = 0 + vmovdqu(a, yword[src]); // load 8 floats + vpand(b, a, mantissa_mask); // b = a & 00000000011111111111111111111111 + vpcmpeqd(b, b, zero); // if (b == 0) b = 1 else b = 0 + vpand(c, a, exponent_mask); // c = a & 01111111100000000000000000000000 + vpcmpeqd(c, c, zero); // if (c == 0) c = 1 else c = 0 + vptest(b, c); // if ((!b & c) == 0) CF = 1 else CF = 0 } - void check_subnormals(const Xbyak::Reg64& src, const Xbyak::Xmm &exponent_mask, const Xbyak::Xmm &mantissa_mask, const Xbyak::Xmm &zero) { + void check_subnormals(const Xbyak::Reg64& src, + const Xbyak::Xmm& exponent_mask, + const Xbyak::Xmm& mantissa_mask, + const Xbyak::Xmm& zero) { auto a = xmm1; auto b = xmm2; auto c = xmm3; - uni_vmovdqu(a, xword[src]); // load 4 floats - uni_vmovdqu(b, a); // b = a - uni_vmovdqu(c, a); // c = a - uni_vpand(b, b, mantissa_mask); // b = a & 00000000011111111111111111111111 - uni_vpcmpeqd(b, b, zero); // if (b == 0) b = 1 else b = 0 - uni_vpand(c, c, exponent_mask); // c = a & 01111111100000000000000000000000 - uni_vpcmpeqd(c, c, zero); // if (c == 0) c = 1 else c = 0 - uni_vtestps(b, c); // if ((!b & c) == 0) CF = 1 else CF = 0 + uni_vmovdqu(a, xword[src]); // load 4 floats + uni_vmovdqu(b, a); // b = a + uni_vmovdqu(c, a); // c = a + uni_vpand(b, b, mantissa_mask); // b = a & 00000000011111111111111111111111 + uni_vpcmpeqd(b, b, zero); // if (b == 0) b = 1 else b = 0 + uni_vpand(c, c, exponent_mask); // c = a & 01111111100000000000000000000000 + uni_vpcmpeqd(c, c, zero); // if (c == 0) c = 1 else 
c = 0 + uni_vtestps(b, c); // if ((!b & c) == 0) CF = 1 else CF = 0 } protected: Label exit, has_subnormals, no_subnormals; - const Reg64 ®_src = rax; - const Reg64 ®_dst = rbx; - const Reg64 ®_sz = rdx; - const Reg64 ®_idx = rsi; - const Reg64 ®_mask_addr = r15; + const Reg64& reg_src = rax; + const Reg64& reg_dst = rbx; + const Reg64& reg_sz = rdx; + const Reg64& reg_idx = rsi; + const Reg64& reg_mask_addr = r15; static const uint32_t exponent_mask_data[8]; static const uint32_t mantissa_mask_data[8]; }; -const uint32_t jit_has_subnormals_base::exponent_mask_data[8] = { - 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, - 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 -}; +const uint32_t jit_has_subnormals_base::exponent_mask_data[8] = + {0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000}; -const uint32_t jit_has_subnormals_base::mantissa_mask_data[8] = { - 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, - 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff -}; +const uint32_t jit_has_subnormals_base::mantissa_mask_data[8] = + {0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff}; -template +template struct jit_has_subnormals : public jit_has_subnormals_base { using Vmm = typename dnnl::impl::utils::conditional::type; @@ -135,7 +137,7 @@ struct jit_has_subnormals : public jit_has_subnormals_base { const Vmm rmm6 = Vmm(6); const int length = isa == sse41 ? 
4 : 8; - void generate() override final { // NOLINT + void generate() override final { // NOLINT size_t const vlen = length; const int sh_bits = std::ilogb(vlen); @@ -162,11 +164,12 @@ struct jit_has_subnormals : public jit_has_subnormals_base { mov(r8, reg_sz); shr(r8, sh_bits); - foreach(reg_idx, 1, r8, [&, this](const Xbyak::Reg64& idx) { + foreach (reg_idx, 1, r8, [&, this](const Xbyak::Reg64& idx) { check_subnormals(reg_src, exponent_mask, mantissa_mask, zero); jnc(has_subnormals); add(reg_src, sizeof(float) * vlen); - }); + }) + ; // Tail shl(reg_idx, sh_bits); @@ -213,11 +216,11 @@ jit_has_subnormals_base::fn_t jit_has_subnormals_function() { return nullptr; } -} // namespace +} // namespace #endif Input::Input(const std::shared_ptr& op, const GraphContext::CPtr context) - : Node(op, context, PassThroughShapeInferFactory()) { + : Node(op, context, PassThroughShapeInferFactory()) { if (!one_of(op->get_type_info(), op::v0::Parameter::get_type_info_static(), op::v0::Constant::get_type_info_static(), @@ -228,9 +231,9 @@ Input::Input(const std::shared_ptr& op, const GraphContext::CPtr conte op->get_type_name(), " with name ", op->get_friendly_name()); - constOp = ov::as_type_ptr(op); - if (constOp) { + if (auto constOp = ov::as_type_ptr(op)) { constant = ConstantType::Const; + m_constOp = constOp; cloneBlobIfRequired(); } else { constant = ConstantType::StrictNoConst; @@ -238,8 +241,14 @@ Input::Input(const std::shared_ptr& op, const GraphContext::CPtr conte } void Input::cloneBlobIfRequired() { - Shape shape(constOp->get_shape().empty() ? ov::Shape(1, 1) : constOp->get_shape()); - const auto prec = constOp->get_element_type(); + const auto prec = m_constOp->get_element_type(); + + if (prec == ov::element::undefined && shape_size(m_constOp->get_shape()) == 0) { + memoryPtr = MemoryDescUtils::makeEmptyMemory(context); + return; + } + + Shape shape(m_constOp->get_shape().empty() ? 
ov::Shape(1, 1) : m_constOp->get_shape()); const size_t size = shape.getElementsCount(); CpuBlockedMemoryDesc memDesc(prec, shape); @@ -251,28 +260,29 @@ void Input::cloneBlobIfRequired() { needFlushDenormalsToZero = false; } - auto cloneBlob = [&, this] () { + auto cloneBlob = [&, this]() { MemoryPtr memory; // CVS-74980 // oneDNN always allocate 1byte for element type with bitWidth < 8 (u4,u1...) // but ngraph Constant uses actual bitWidth for data storage allocation // in that case we make a copy to avoid overflow - if (constOp->get_byte_size() >= memDesc.getCurrentMemSize()) { - if (constOp->get_element_type() == element::string) { - memory = std::make_shared(getEngine(), memDesc, constOp->get_data_ptr()); + if (m_constOp->get_byte_size() >= memDesc.getCurrentMemSize()) { + if (m_constOp->get_element_type() == element::string) { + memory = + std::make_shared(getEngine(), memDesc, m_constOp->get_data_ptr()); } else { - memory = std::make_shared(getEngine(), memDesc, constOp->get_data_ptr()); + memory = std::make_shared(getEngine(), memDesc, m_constOp->get_data_ptr()); } } else { - if (constOp->get_element_type() == element::string) { + if (m_constOp->get_element_type() == element::string) { memory = std::make_shared(getEngine(), memDesc); - auto src = constOp->get_data_ptr(); + auto src = m_constOp->get_data_ptr(); auto dst = memory->getDataAs(); std::copy(src, src + size, dst); } else { memory = std::make_shared(getEngine(), memDesc); - memcpy(memory->getData(), constOp->get_data_ptr(), constOp->get_byte_size()); + memcpy(memory->getData(), m_constOp->get_data_ptr(), m_constOp->get_byte_size()); } } @@ -287,22 +297,22 @@ void Input::cloneBlobIfRequired() { return ptr; }; - auto isBlobAligned = [&] () { - bool blobAlignedOnSSE = true; + auto isBlobAligned = [](const std::shared_ptr& constant) { #if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64) // Majority of arithmetic and data processing instructions in legacy SSE isa requires // the memory 
address in the operands must be aligned on 16-byte boundary. To ensure // safely reusing ngraph const blob memory, need to check address alignment. - const void *ptr = constOp->get_data_ptr(); - blobAlignedOnSSE = mayiuse(cpu_isa_t::avx2) || ((reinterpret_cast(ptr) & 15) == 0); + const void* ptr = constant->get_data_ptr(); + return mayiuse(cpu_isa_t::avx2) || ((reinterpret_cast(ptr) & 15) == 0); +#else + return true; #endif - return blobAlignedOnSSE; }; // The presence of subnormals is better to determined at IR read time. - auto hasSubnormals = [&] () { + auto hasSubnormals = [&]() { if (prec == ov::element::f32) { - uint32_t const *u32data = constOp->get_data_ptr(); + uint32_t const* u32data = m_constOp->get_data_ptr(); if (!size) return false; @@ -316,11 +326,9 @@ void Input::cloneBlobIfRequired() { parallel_for(iterations_num, [&](int n) { auto ptr = u32data + n * batch_size; - const jit_has_subnormals_base::args_t args = { - reinterpret_cast(ptr), - std::min(batch_size, (size_t)(u32data + size - ptr)), - false - }; + const jit_has_subnormals_base::args_t args = {reinterpret_cast(ptr), + std::min(batch_size, (size_t)(u32data + size - ptr)), + false}; fn(&args); @@ -343,12 +351,10 @@ void Input::cloneBlobIfRequired() { return false; }; - auto blobKey = [&] () { + auto blobKey = [&]() { char ptr[32]; - snprintf(ptr, sizeof ptr, "%p", constOp->get_data_ptr()); - return getName() - + "_" + std::to_string(size * prec.size()) - + "_" + ptr; + snprintf(ptr, sizeof ptr, "%p", m_constOp->get_data_ptr()); + return getName() + "_" + std::to_string(size * prec.size()) + "_" + ptr; }; const auto weightCache = context->getWeightsCache(); @@ -356,39 +362,37 @@ void Input::cloneBlobIfRequired() { prec != element::string && // IRs already have all subnormals flushed to zero, but in // read_model scenario with directly loaded original model still can have subnormals - isBlobAligned() && (!needFlushDenormalsToZero || !hasSubnormals()) && + isBlobAligned(m_constOp) && 
(!needFlushDenormalsToZero || !hasSubnormals()) && // Blob should be cloned in cache only if original weights are stored on other numa node. // This is possible only in multistream case on multisocket machine. - // TODO: don't clone blob for multisocket + multistream case if current stream is run on the numa node where original weights are stored. + // TODO: don't clone blob for multisocket + multistream case if current stream is run on the numa node where + // original weights are stored. (!weightCache || context->getNumNumaNodes() == 1 || context->getCPUStreamExecutor()->get_streams_num() == 1); - memoryPtr = clone_is_not_needed ? std::make_shared(getEngine(), memDesc, constOp->get_data_ptr()) + + memoryPtr = clone_is_not_needed ? std::make_shared(getEngine(), memDesc, m_constOp->get_data_ptr()) : std::const_pointer_cast( weightCache ? *weightCache->findOrCreate(blobKey(), cloneBlob) : cloneBlob()); } -static std::vector createInputShapes(const Shape& shape, - const Type type) { +static std::vector createInputShapes(const Shape& shape, const Type type) { if (type == Type::Output) return {shape}; return {}; } -static std::vector createOutputShapes(const Shape& shape, - const Type type) { +static std::vector createOutputShapes(const Shape& shape, const Type type) { if (type == Type::Input) return {shape}; return {}; } -static std::vector createInputPrecisions(const ov::element::Type& prc, - const Type type) { +static std::vector createInputPrecisions(const ov::element::Type& prc, const Type type) { if (type == Type::Output) return {prc}; return {}; } -static std::vector createOutputPrecisions(const ov::element::Type& prc, - const Type type) { +static std::vector createOutputPrecisions(const ov::element::Type& prc, const Type type) { if (type == Type::Input) return {prc}; return {}; @@ -418,17 +422,13 @@ Input::Input(MemoryDescPtr memDesc, const std::string& name, const std::string& extMemDesc = memDesc; } -Input::Input(const std::shared_ptr& op, - const 
GraphContext::CPtr context, - InputConfig config) +Input::Input(const std::shared_ptr& op, const GraphContext::CPtr context, InputConfig config) : Input(op, context) { extMemDesc = config.desc; m_isInPlace = config.inPlace; } -Input::Input(const std::shared_ptr& op, - const GraphContext::CPtr context, - OutputConfig config) +Input::Input(const std::shared_ptr& op, const GraphContext::CPtr context, OutputConfig config) : Input(op, context) { extMemDesc = config.desc; m_useParentMemoryDescForOutput = config.useParentMemoryDescForOutput; @@ -489,17 +489,23 @@ void Input::createPrimitive() { for (size_t i = 0; i < getChildEdges().size(); i++) { auto dstMemPtr = getDstMemoryAtPort(i); if (!dstMemPtr) - THROW_CPU_NODE_ERR("has null memory object at port ", i, - " to node ", getChildEdgeAt(i)->getChild()->getName(), "."); + THROW_CPU_NODE_ERR("has null memory object at port ", + i, + " to node ", + getChildEdgeAt(i)->getChild()->getName(), + "."); } for (size_t i = 0; i < getParentEdges().size(); i++) { auto srcMemPtr = getSrcMemoryAtPort(i); if (!srcMemPtr) - THROW_CPU_NODE_ERR("has null memory object at port ", i, - " from node ", getParentEdgeAt(i)->getParent()->getName(), "."); + THROW_CPU_NODE_ERR("has null memory object at port ", + i, + " from node ", + getParentEdgeAt(i)->getParent()->getName(), + "."); } - const NodeDesc *selected_pd = getSelectedPrimitiveDescriptor(); + const NodeDesc* selected_pd = getSelectedPrimitiveDescriptor(); if (selected_pd == nullptr) THROW_CPU_NODE_ERR("doesn't have selected primitive descriptor."); } @@ -525,9 +531,7 @@ void Input::initSupportedPdDefault() { inPortConfs.push_back({LayoutType::ncsp, precision}); } - addSupportedPrimDesc(inPortConfs, - outPortConfs, - impl_desc_type::unknown); + addSupportedPrimDesc(inPortConfs, outPortConfs, impl_desc_type::unknown); } void Input::initSupportedPdFromMemDesc() { @@ -543,6 +547,6 @@ void Input::initSupportedPdFromMemDesc() { supportedPrimitiveDescriptors.emplace_back(std::move(config), 
impl_desc_type::unknown); } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/input.h b/src/plugins/intel_cpu/src/nodes/input.h index 4d7febb17ad4b7..6d1f4c27238540 100644 --- a/src/plugins/intel_cpu/src/nodes/input.h +++ b/src/plugins/intel_cpu/src/nodes/input.h @@ -5,6 +5,7 @@ #pragma once #include + #include namespace ov { @@ -42,13 +43,9 @@ class Input : public Node { Input(MemoryDescPtr memDesc, const std::string& name, const std::string& type, const GraphContext::CPtr context); - Input(const std::shared_ptr& op, - const GraphContext::CPtr context, - InputConfig config); + Input(const std::shared_ptr& op, const GraphContext::CPtr context, InputConfig config); - Input(const std::shared_ptr& op, - const GraphContext::CPtr context, - OutputConfig config); + Input(const std::shared_ptr& op, const GraphContext::CPtr context, OutputConfig config); void getSupportedDescriptors() override; void initSupportedPrimitiveDescriptors() override; @@ -66,8 +63,12 @@ class Input : public Node { return false; } - bool needShapeInfer() const override { return false; } - bool needPrepareParams() const override { return false; } + bool needShapeInfer() const override { + return false; + } + bool needPrepareParams() const override { + return false; + } private: void cloneBlobIfRequired(); @@ -75,7 +76,7 @@ class Input : public Node { void initSupportedPdFromMemDesc(); private: - std::shared_ptr constOp; + std::shared_ptr m_constOp; MemoryCPtr memoryPtr; bool isMeanImage = false; MemoryDescPtr extMemDesc = nullptr; @@ -83,6 +84,6 @@ class Input : public Node { bool m_isInPlace = false; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/interaction.cpp b/src/plugins/intel_cpu/src/nodes/interaction.cpp index 
5ec48e7a263272..905724c3bc829a 100644 --- a/src/plugins/intel_cpu/src/nodes/interaction.cpp +++ b/src/plugins/intel_cpu/src/nodes/interaction.cpp @@ -4,7 +4,10 @@ #include "interaction.h" -#include "transformations/cpu_opset/x64/op/interaction.hpp" +#include +#include +#include + #include "common/bfloat16.hpp" #include "common/cpu_memcpy.h" #include "cpu/x64/cpu_isa_traits.hpp" @@ -16,10 +19,7 @@ #include "memory_desc/dnnl_blocked_memory_desc.h" #include "nodes/common/cpu_convert.h" #include "onednn/dnnl.h" - -#include -#include -#include +#include "transformations/cpu_opset/x64/op/interaction.hpp" using namespace dnnl::impl::cpu::x64; using namespace Xbyak; @@ -36,7 +36,9 @@ template struct jit_move_scale_kernel : public jit_uni_move_scale_kernel, public jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_move_scale_kernel) - explicit jit_move_scale_kernel(const jit_move_scale_compile_params& jcp) : jit_uni_move_scale_kernel(jcp), jit_generator(jit_name()) { + explicit jit_move_scale_kernel(const jit_move_scale_compile_params& jcp) + : jit_uni_move_scale_kernel(jcp), + jit_generator(jit_name()) { runtime_prc = jcp_.src_prc == ov::element::bf16 ? 
ov::element::bf16 : ov::element::f32; if (jcp_.dst_prc == ov::element::i8 || jcp_.dst_prc == ov::element::u8) runtime_prc = ov::element::f32; @@ -50,12 +52,13 @@ struct jit_move_scale_kernel : public jit_uni_move_scale_kernel, public jit_gene } private: - using Vmm = typename dnnl::impl::utils::conditional3::type; + using Vmm = + typename dnnl::impl::utils::conditional3::type; void generate() override { this->preamble(); -#define GET_OFF(field) offsetof(jit_move_scale_call_args, field) +# define GET_OFF(field) offsetof(jit_move_scale_call_args, field) mov(reg_in, ptr[reg_params + GET_OFF(p_in)]); mov(reg_out, ptr[reg_params + GET_OFF(p_out)]); mov(reg_work_amount, jcp_.input_size); @@ -107,7 +110,7 @@ struct jit_move_scale_kernel : public jit_uni_move_scale_kernel, public jit_gene if (jcp_.with_scales) { if (!jcp_.broadcast_scales) { load(vmm_scales, reg_scales, ov::element::f32, ov::element::f32, step, false); - add(reg_scales, sizeof(float) * step); + add(reg_scales, sizeof(float) * step); } uni_vmulps(vmm_in, vmm_in, vmm_scales); } @@ -119,25 +122,39 @@ struct jit_move_scale_kernel : public jit_uni_move_scale_kernel, public jit_gene add(reg_out_aux, jcp_.dst_prc.size() * step); } } -#undef GET_OFF - - inline void load(const Vmm& vmm_dst, const Xbyak::Reg64& reg_src, ov::element::Type src_prc, ov::element::Type dst_prc, const int& elt_num, bool fill) { +# undef GET_OFF + + inline void load(const Vmm& vmm_dst, + const Xbyak::Reg64& reg_src, + ov::element::Type src_prc, + ov::element::Type dst_prc, + const int& elt_num, + bool fill) { const auto seed = load_emitter_params(src_prc, dst_prc, elt_num, fill, "float_min").hash(); if (!emitters[seed]) { - emitters[seed].reset(new jit_load_emitter(this, isa, src_prc, dst_prc, elt_num, src_prc, fill, "float_min")); + emitters[seed].reset( + new jit_load_emitter(this, isa, src_prc, dst_prc, elt_num, src_prc, fill, "float_min")); } - emitters[seed]->emit_code({static_cast(reg_src.getIdx()), 0}, 
{static_cast(vmm_dst.getIdx())}, - pool_aux_vmm_idxs, pool_aux_gpr_idxs); + emitters[seed]->emit_code({static_cast(reg_src.getIdx()), 0}, + {static_cast(vmm_dst.getIdx())}, + pool_aux_vmm_idxs, + pool_aux_gpr_idxs); } - inline void store(const Xbyak::Reg64& reg_dst, const Vmm& vmm_src, ov::element::Type src_prc, ov::element::Type dst_prc, const int& elt_num) { + inline void store(const Xbyak::Reg64& reg_dst, + const Vmm& vmm_src, + ov::element::Type src_prc, + ov::element::Type dst_prc, + const int& elt_num) { const auto seed = store_emitter_params(src_prc, dst_prc, elt_num).hash(); if (!emitters[seed]) { emitters[seed].reset(new jit_store_emitter(this, isa, src_prc, dst_prc, elt_num)); } - emitters[seed]->emit_code({static_cast(vmm_src.getIdx())}, {static_cast(reg_dst.getIdx())}, - pool_aux_vmm_idxs, pool_aux_gpr_idxs); + emitters[seed]->emit_code({static_cast(vmm_src.getIdx())}, + {static_cast(reg_dst.getIdx())}, + pool_aux_vmm_idxs, + pool_aux_gpr_idxs); } size_t vec_size; @@ -155,13 +172,14 @@ struct jit_move_scale_kernel : public jit_uni_move_scale_kernel, public jit_gene Reg64 reg_work_amount = r14; Reg64 reg_params = abi_param1; - const std::vector pool_aux_gpr_idxs = { static_cast(rsi.getIdx()), static_cast(rbp.getIdx()) }; - const std::vector pool_aux_vmm_idxs = { static_cast(xmm_tmp.getIdx()) }; + const std::vector pool_aux_gpr_idxs = {static_cast(rsi.getIdx()), + static_cast(rbp.getIdx())}; + const std::vector pool_aux_vmm_idxs = {static_cast(xmm_tmp.getIdx())}; std::unordered_map> emitters; }; -#endif // OPENVINO_ARCH_X86_64 +#endif // OPENVINO_ARCH_X86_64 Interaction::Interaction(const std::shared_ptr& op, const GraphContext::CPtr context) : Node(op, context, NgraphShapeInferFactory(op)) { @@ -174,7 +192,7 @@ Interaction::Interaction(const std::shared_ptr& op, const GraphContext const std::vector& scales = interaction->get_output_scales(); if (!scales.empty()) { fqScales = scales; - outputDataType = interaction->get_output_element_type(0); + 
outputDataType = interaction->get_output_element_type(0); } } @@ -194,23 +212,12 @@ void Interaction::initSupportedPrimitiveDescriptors() { // initialize input ports std::vector inPortConfigs; for (size_t i = 0; i < getParentEdges().size(); ++i) { - inPortConfigs.emplace_back( - LayoutType::ncsp, - dataPrecision, - getInputShapeAtPort(i), - false, -1); + inPortConfigs.emplace_back(LayoutType::ncsp, dataPrecision, getInputShapeAtPort(i), false, -1); } // initialize output port std::vector outPortConfigs = { - PortConfigurator { - LayoutType::ncsp, - outputDataType, - getOutputShapeAtPort(0), - false, - -1 - } - }; - //add descriptor + PortConfigurator{LayoutType::ncsp, outputDataType, getOutputShapeAtPort(0), false, -1}}; + // add descriptor addSupportedPrimDesc(inPortConfigs, outPortConfigs, impl_desc_type::ref_any); } @@ -221,8 +228,7 @@ static inline void cat(uint8_t* out, size_t elemSize) { size_t offset = 0; for (size_t j = 0; j < feature_sizes.size(); j++) { - cpu_memcpy(out + offset * elemSize, in[j] + bs * feature_sizes[j] * elemSize, - feature_sizes[j] * elemSize); + cpu_memcpy(out + offset * elemSize, in[j] + bs * feature_sizes[j] * elemSize, feature_sizes[j] * elemSize); offset += feature_sizes[j]; } } @@ -303,8 +309,7 @@ void Interaction::prepareParams() { auto matmul_pd = matmul::primitive_desc(getEngine(), src_md, weights_md, dst_md, matmul_attr); prim = matmul(matmul_pd); featureSizes.assign(inputSizes, featureSize); - auto initMemoryPtr = [&](const ov::element::Type& prc, const intel_cpu::Shape& shape, - MemoryPtr& ptr) { + auto initMemoryPtr = [&](const ov::element::Type& prc, const intel_cpu::Shape& shape, MemoryPtr& ptr) { ptr = std::make_shared(getEngine(), intel_cpu::DnnlBlockedMemoryDesc(prc, shape)); }; initMemoryPtr(dataPrecision, intel_cpu::Shape{inputSizes, featureSize}, inputMemPtr); @@ -336,7 +341,7 @@ void Interaction::prepareParams() { moveFeatureKernel.reset(new jit_move_scale_kernel(jcp)); moveInteractKernel.reset(new 
jit_move_scale_kernel(interJcp)); } -#endif // OPENVINO_ARCH_X86_64 +#endif // OPENVINO_ARCH_X86_64 if (moveFeatureKernel && moveInteractKernel) { moveFeatureKernel->create_ker(); @@ -360,8 +365,7 @@ bool Interaction::isExecutable() const { return true; } -bool Interaction::isSupportedOperation(const std::shared_ptr& op, - std::string& errorMessage) noexcept { +bool Interaction::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { const auto interaction = std::dynamic_pointer_cast(op); if (!interaction) { @@ -374,7 +378,6 @@ bool Interaction::isSupportedOperation(const std::shared_ptr& op return true; } - -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/interaction.h b/src/plugins/intel_cpu/src/nodes/interaction.h index 448484a2512dd1..794ea0af24a87c 100644 --- a/src/plugins/intel_cpu/src/nodes/interaction.h +++ b/src/plugins/intel_cpu/src/nodes/interaction.h @@ -19,31 +19,31 @@ struct jit_move_scale_compile_params { }; struct jit_move_scale_call_args { - const void *p_in; - void *p_out; - const void *p_scales; + const void* p_in; + void* p_out; + const void* p_scales; }; struct jit_uni_move_scale_kernel { - void (*ker_)(const jit_move_scale_call_args*); + void (*ker_)(const jit_move_scale_call_args*); - void operator()(const jit_move_scale_call_args* call_args) { - assert(ker_); - ker_(call_args); - } + void operator()(const jit_move_scale_call_args* call_args) { + assert(ker_); + ker_(call_args); + } - explicit jit_uni_move_scale_kernel(const jit_move_scale_compile_params& jcp) : ker_(nullptr), jcp_(jcp) {} - virtual ~jit_uni_move_scale_kernel() {} + explicit jit_uni_move_scale_kernel(const jit_move_scale_compile_params& jcp) : ker_(nullptr), jcp_(jcp) {} + virtual ~jit_uni_move_scale_kernel() {} - virtual void create_ker() = 0; + virtual void create_ker() = 0; - jit_move_scale_compile_params jcp_; 
+ jit_move_scale_compile_params jcp_; }; class Interaction : public Node { public: Interaction(const std::shared_ptr& op, const GraphContext::CPtr context); - void getSupportedDescriptors() override {}; + void getSupportedDescriptors() override{}; void initSupportedPrimitiveDescriptors() override; void execute(dnnl::stream strm) override; bool created() const override; @@ -74,6 +74,6 @@ class Interaction : public Node { std::unique_ptr moveInteractKernel; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/interpolate.cpp b/src/plugins/intel_cpu/src/nodes/interpolate.cpp index 7eed5c1df9789b..beb53cb89a831e 100644 --- a/src/plugins/intel_cpu/src/nodes/interpolate.cpp +++ b/src/plugins/intel_cpu/src/nodes/interpolate.cpp @@ -4,6 +4,10 @@ #include "interpolate.h" +#include +#include +#include + #include "common/cpu_memcpy.h" #include "cpu/x64/injectors/jit_uni_depthwise_injector.hpp" #include "cpu/x64/injectors/jit_uni_eltwise_injector.hpp" @@ -21,16 +25,11 @@ #include "openvino/opsets/opset11.hpp" #include "openvino/opsets/opset4.hpp" #include "shape_inference/shape_inference.hpp" -#include "shape_inference/shape_inference_ngraph.hpp" #include "shape_inference/static_shape.hpp" #include "utils/bfloat16.hpp" #include "utils/cpu_utils.hpp" #include "utils/ngraph_utils.hpp" -#include -#include -#include - using namespace dnnl; using namespace dnnl::impl; @@ -39,7 +38,6 @@ using namespace dnnl::impl::cpu::x64; using namespace dnnl::impl::utils; using namespace Xbyak; - #define GET_OFF(field) offsetof(jit_interpolate_call_args, field) namespace ov { @@ -56,8 +54,9 @@ template struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, public jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_interpolate_kernel_f32) - explicit jit_uni_interpolate_kernel_f32(jit_interpolate_config_params jcp, const dnnl_primitive_attr &attr) - : 
jit_uni_interpolate_kernel(jcp, attr), jit_generator(jit_name()) {} + explicit jit_uni_interpolate_kernel_f32(jit_interpolate_config_params jcp, const dnnl_primitive_attr& attr) + : jit_uni_interpolate_kernel(jcp, attr), + jit_generator(jit_name()) {} void create_ker() override { jit_generator::create_kernel(); @@ -70,23 +69,24 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi store_pool_gpr_idxs = {static_cast(reg_tmp_64.getIdx())}; store_pool_vec_idxs = {static_cast(vmm_zero.getIdx())}; - const auto &p = attr_.post_ops_; + const auto& p = attr_.post_ops_; for (int i = 0; i < p.len(); i++) { - auto &post_op = p.entry_[i]; + auto& post_op = p.entry_[i]; if (post_op.is_eltwise()) { - eltwise_injectors.push_back(std::make_shared>( - this, - post_op.eltwise.alg, - post_op.eltwise.alpha, - post_op.eltwise.beta, - 1.f)); + eltwise_injectors.push_back(std::make_shared>(this, + post_op.eltwise.alg, + post_op.eltwise.alpha, + post_op.eltwise.beta, + 1.f)); } else if (post_op.is_depthwise()) { - depthwise_injectors.push_back(std::make_shared>( - this, - post_op)); + depthwise_injectors.push_back(std::make_shared>(this, post_op)); } else if (post_op.is_quantization()) { - quantization_injectors.push_back(std::make_shared>( - this, post_op, vmm_d_weights, vmm_d_bias, reg_d_weights, reg_d_bias)); + quantization_injectors.push_back(std::make_shared>(this, + post_op, + vmm_d_weights, + vmm_d_bias, + reg_d_weights, + reg_d_bias)); } } @@ -99,81 +99,82 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi uni_vpxor(vmm_zero, vmm_zero, vmm_zero); switch (jcp_.mode) { - case InterpolateMode::nearest: { - mov(reg_dst, ptr[reg_params + GET_OFF(dst)]); - mov(reg_src, ptr[reg_params + GET_OFF(src_ptr[0])]); - mov(reg_index, ptr[reg_params + GET_OFF(index)]); - mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]); - - switch (jcp_.layout) { - case InterpolateLayoutType::planar: { - nn_planar(); - break; - } - case 
InterpolateLayoutType::block: { - nn_blk(); - break; - } - case InterpolateLayoutType::by_channel: { - nn_by_channel(); - break; - } - default: - assert(!"unsupported memory layout for interpolate layer with nearest neighbor mode."); - } + case InterpolateMode::nearest: { + mov(reg_dst, ptr[reg_params + GET_OFF(dst)]); + mov(reg_src, ptr[reg_params + GET_OFF(src_ptr[0])]); + mov(reg_index, ptr[reg_params + GET_OFF(index)]); + mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]); + + switch (jcp_.layout) { + case InterpolateLayoutType::planar: { + nn_planar(); break; } - case InterpolateMode::linear_onnx: { - switch (jcp_.layout) { - case InterpolateLayoutType::planar: { - linear_onnx_planar(); - break; - } - case InterpolateLayoutType::block: - case InterpolateLayoutType::by_channel: { - linear_onnx_c_gathered(); - break; - } - default: - assert(!"unsupported memory layout for interpolate layer with linear_onnx mode."); - } + case InterpolateLayoutType::block: { + nn_blk(); break; } - case InterpolateMode::cubic: { - switch (jcp_.layout) { - case InterpolateLayoutType::planar: { - cubic_planar(); - break; - } - case InterpolateLayoutType::block: - case InterpolateLayoutType::by_channel: { - cubic_c_gathered(); - break; - } - default: - assert(!"unsupported memory layout for interpolate layer with cubic mode."); - } + case InterpolateLayoutType::by_channel: { + nn_by_channel(); break; } - case InterpolateMode::bilinear_pillow: - case InterpolateMode::bicubic_pillow: { - switch (jcp_.layout) { - case InterpolateLayoutType::by_channel: { - pillow_by_channel(); - break; - } - default: - assert(!"unsupported memory layout for interpolate layer with bilinear_pillow and bicubic_pillow modes."); - } + default: + assert(!"unsupported memory layout for interpolate layer with nearest neighbor mode."); + } + break; + } + case InterpolateMode::linear_onnx: { + switch (jcp_.layout) { + case InterpolateLayoutType::planar: { + linear_onnx_planar(); break; } - case 
InterpolateMode::linear: { - assert(!"unsupported mode for interpolate layer with JITTED implimentation."); + case InterpolateLayoutType::block: + case InterpolateLayoutType::by_channel: { + linear_onnx_c_gathered(); break; } - default: { - assert(!"unsupported mode for interpolate layer."); + default: + assert(!"unsupported memory layout for interpolate layer with linear_onnx mode."); } + break; + } + case InterpolateMode::cubic: { + switch (jcp_.layout) { + case InterpolateLayoutType::planar: { + cubic_planar(); + break; + } + case InterpolateLayoutType::block: + case InterpolateLayoutType::by_channel: { + cubic_c_gathered(); + break; + } + default: + assert(!"unsupported memory layout for interpolate layer with cubic mode."); + } + break; + } + case InterpolateMode::bilinear_pillow: + case InterpolateMode::bicubic_pillow: { + switch (jcp_.layout) { + case InterpolateLayoutType::by_channel: { + pillow_by_channel(); + break; + } + default: + assert( + !"unsupported memory layout for interpolate layer with bilinear_pillow and bicubic_pillow modes."); + } + break; + } + case InterpolateMode::linear: { + assert(!"unsupported mode for interpolate layer with JITTED implimentation."); + break; + } + default: { + assert(!"unsupported mode for interpolate layer."); + } } this->postamble(); @@ -187,8 +188,8 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi } private: - using Vmm = typename conditional3::type; + using Vmm = + typename conditional3::type; const int vlen = cpu_isa_traits::vlen; const int vector_step = vlen / sizeof(float); @@ -217,7 +218,7 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi // for cubic planar Xbyak::Reg64 reg_tbl_y = rsi; Xbyak::Reg64 reg_tbl_x = rbp; - Xbyak::Reg64 reg_table = rdx; // do not need reg_index_offset in this mode, so use rdx + Xbyak::Reg64 reg_table = rdx; // do not need reg_index_offset in this mode, so use rdx Vmm vmm_val = Vmm(1); Vmm vmm_index = Vmm(0); @@ 
-293,14 +294,21 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi emit_load(reg_src, vmm_src, ov::element::f32, ov::element::f32, elt_num, offset); } - inline void emit_load(Xbyak::Reg64 reg_src, Vmm vmm_src, ov::element::Type src_prc, ov::element::Type dst_prc, const int elt_num, const int offset = 0) { + inline void emit_load(Xbyak::Reg64 reg_src, + Vmm vmm_src, + ov::element::Type src_prc, + ov::element::Type dst_prc, + const int elt_num, + const int offset = 0) { const auto seed = load_emitter_params(src_prc, dst_prc, elt_num).hash(); if (!emitters[seed]) { emitters[seed].reset(new jit_load_emitter(this, isa, src_prc, dst_prc, elt_num)); } emitters[seed]->emit_code({static_cast(reg_src.getIdx()), static_cast(offset)}, - {static_cast(vmm_src.getIdx())}, {}, {load_pool_gpr_idxs}); + {static_cast(vmm_src.getIdx())}, + {}, + {load_pool_gpr_idxs}); } inline void store(Vmm vmm_dst, Xbyak::Reg64 reg_dst, const int elt_num, const int offset = 0) { @@ -310,12 +318,15 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi } // for cases when Store emitter need 2 aux vmm we can use vmm_dst as second aux vmm - std::vector local_store_pool_vec_idxs = { static_cast(vmm_dst.getIdx()) }; - local_store_pool_vec_idxs.insert(local_store_pool_vec_idxs.begin(), store_pool_vec_idxs.begin(), store_pool_vec_idxs.end()); + std::vector local_store_pool_vec_idxs = {static_cast(vmm_dst.getIdx())}; + local_store_pool_vec_idxs.insert(local_store_pool_vec_idxs.begin(), + store_pool_vec_idxs.begin(), + store_pool_vec_idxs.end()); emitters[seed]->emit_code({static_cast(vmm_dst.getIdx())}, {static_cast(reg_dst.getIdx()), static_cast(offset)}, - {local_store_pool_vec_idxs}, {store_pool_gpr_idxs}); + {local_store_pool_vec_idxs}, + {store_pool_gpr_idxs}); } // kernel for OH * OW * C @@ -398,9 +409,10 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi } // if int, round if 
(!isFloatCompatible(jcp_.src_prc)) { - uni_vroundps(vmm_dst, vmm_dst, 0x0); // Round near + uni_vroundps(vmm_dst, vmm_dst, 0x0); // Round near } - // src_prc, dst_prc and buf ov::element::Type is the same, otherwise need another store with buf(src) precision + // src_prc, dst_prc and buf ov::element::Type is the same, otherwise need another store with + // buf(src) precision store(vmm_dst, reg_dst_aux, vector_step); add(reg_dst_aux, vector_step * jcp_.src_data_size); // advance 8/16 faciliate next block @@ -416,7 +428,7 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi uni_vfmadd231ps(vmm_dst, vmm_val, vmm_weight); } if (!isFloatCompatible(jcp_.src_prc)) { - uni_vroundps(vmm_dst, vmm_dst, 0x0); // Round near + uni_vroundps(vmm_dst, vmm_dst, 0x0); // Round near } store(vmm_dst, reg_dst_aux, tail_num); add(reg_dst_aux, tail_num * jcp_.src_data_size); @@ -448,7 +460,7 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi uni_vfmadd231ps(vmm_dst, vmm_val, vmm_weight); } if (!isFloatCompatible(jcp_.src_prc)) { - uni_vroundps(vmm_dst, vmm_dst, 0x0); // Round near + uni_vroundps(vmm_dst, vmm_dst, 0x0); // Round near } store(vmm_dst, reg_dst, vector_step); add(reg_dst, vector_step * jcp_.dst_data_size); @@ -464,7 +476,7 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi uni_vfmadd231ps(vmm_dst, vmm_val, vmm_weight); } if (!isFloatCompatible(jcp_.src_prc)) { - uni_vroundps(vmm_dst, vmm_dst, 0x0); // Round near + uni_vroundps(vmm_dst, vmm_dst, 0x0); // Round near } store(vmm_dst, reg_dst, tail_num); add(reg_dst, tail_num * jcp_.dst_data_size); @@ -496,7 +508,7 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi cmp(reg_work_amount_oh, 1); jl(out_loop_end, T_NEAR); - //reset work_amount to OW + // reset work_amount to OW mov(reg_work_amount, jcp_.OW); Xbyak::Reg64 reg_src_h = rsi; @@ -513,7 +525,7 @@ struct jit_uni_interpolate_kernel_f32 : public 
jit_uni_interpolate_kernel, publi Xbyak::Label nn_tail_loop_label; Xbyak::Label nn_tail_loop_end_label; - L(nn_loop_label); // inner loop + L(nn_loop_label); // inner loop { cmp(reg_work_amount, vector_step); jl(nn_loop_end_label, T_NEAR); @@ -553,9 +565,9 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi jmp(nn_tail_loop_label, T_NEAR); } - L(nn_tail_loop_end_label); // inner loop end + L(nn_tail_loop_end_label); // inner loop end - //increment index_h to next row + // increment index_h to next row add(reg_index_h, jcp_.indices_size); sub(reg_work_amount_oh, 1); @@ -621,7 +633,7 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi cmp(reg_work_amount_out, 1); jl(out_loop_end, T_NEAR); - //inner loop for C + // inner loop for C Xbyak::Label nn_loop_label; Xbyak::Label nn_loop_end_label; Xbyak::Label nn_tail_loop_label; @@ -717,10 +729,12 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]); int blk = (isa == cpu::x64::sse41) ? (2 * vector_step) : vector_step; - int dst_stride = (jcp_.layout == InterpolateLayoutType::by_channel) ? (vector_step * jcp_.dst_data_size) : - (blk * jcp_.OW * jcp_.OH * jcp_.OD * jcp_.dst_data_size); - int src_stride = (jcp_.layout == InterpolateLayoutType::by_channel) ? (vector_step * jcp_.src_data_size) : - (blk * jcp_.IW * jcp_.IH * jcp_.ID * jcp_.src_data_size); + int dst_stride = (jcp_.layout == InterpolateLayoutType::by_channel) + ? (vector_step * jcp_.dst_data_size) + : (blk * jcp_.OW * jcp_.OH * jcp_.OD * jcp_.dst_data_size); + int src_stride = (jcp_.layout == InterpolateLayoutType::by_channel) + ? 
(vector_step * jcp_.src_data_size) + : (blk * jcp_.IW * jcp_.IH * jcp_.ID * jcp_.src_data_size); Xbyak::Label main_loop_label; Xbyak::Label main_loop_end_label; @@ -758,8 +772,10 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi // 2d for end depth linear_onnx_worker_2d(); // 3th dimension - uni_vmulps(vmm_valTR, vmm_valTR, vmm_weightE); // end_value * end_weight - uni_vfmadd231ps(vmm_valTR, vmm_d_bias, vmm_weightF); // start_value * start_weight + end_value * end_weight + uni_vmulps(vmm_valTR, vmm_valTR, vmm_weightE); // end_value * end_weight + uni_vfmadd231ps(vmm_valTR, + vmm_d_bias, + vmm_weightF); // start_value * start_weight + end_value * end_weight } if (attr_.post_ops_.len() != 0) { @@ -789,8 +805,10 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi // 2d for end depth linear_onnx_worker_2d(); // 3th dimension - uni_vmulps(vmm_valTR, vmm_valTR, vmm_weightE); // end_value * end_weight - uni_vfmadd231ps(vmm_valTR, vmm_d_bias, vmm_weightF); // start_value * start_weight + end_value * end_weight + uni_vmulps(vmm_valTR, vmm_valTR, vmm_weightE); // end_value * end_weight + uni_vfmadd231ps(vmm_valTR, + vmm_d_bias, + vmm_weightF); // start_value * start_weight + end_value * end_weight } if (attr_.post_ops_.len() != 0) { @@ -814,9 +832,9 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi add(reg_src_aux7, src_stride); } if (jcp_.layout == InterpolateLayoutType::by_channel) { - sub(reg_work_amount, vector_step); // work_amount is c + sub(reg_work_amount, vector_step); // work_amount is c } else { - sub(reg_work_amount, 1); // work_amount = div_up(c, blk), no tails + sub(reg_work_amount, 1); // work_amount = div_up(c, blk), no tails } jmp(main_loop_label, T_NEAR); @@ -844,8 +862,10 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi // 2d for end depth linear_onnx_worker_2d(); // 3th dimension - uni_vmulps(vmm_valTR, vmm_valTR, 
vmm_weightE); // end_value * end_weight - uni_vfmadd231ps(vmm_valTR, vmm_d_bias, vmm_weightF); // start_value * start_weight + end_value * end_weight + uni_vmulps(vmm_valTR, vmm_valTR, vmm_weightE); // end_value * end_weight + uni_vfmadd231ps(vmm_valTR, + vmm_d_bias, + vmm_weightF); // start_value * start_weight + end_value * end_weight } if (attr_.post_ops_.len() != 0) { @@ -930,8 +950,10 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi load_weights(reg_src_aux, vmm_weightE, vector_step, 5 * weight_stride); load_weights(reg_src_aux, vmm_weightF, vector_step, 4 * weight_stride); - uni_vmulps(vmm_valTR, vmm_valTR, vmm_weightE); // end_value * end_weight - uni_vfmadd231ps(vmm_valTR, vmm_d_bias, vmm_weightF); // start_value * start_weight + end_value * end_weight + uni_vmulps(vmm_valTR, vmm_valTR, vmm_weightE); // end_value * end_weight + uni_vfmadd231ps(vmm_valTR, + vmm_d_bias, + vmm_weightF); // start_value * start_weight + end_value * end_weight } if (attr_.post_ops_.len() != 0) { @@ -1014,8 +1036,10 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi load_weights(reg_src_aux, vmm_weightE, scalar_step, 5 * weight_stride); load_weights(reg_src_aux, vmm_weightF, scalar_step, 4 * weight_stride); - uni_vmulps(vmm_valTR, vmm_valTR, vmm_weightE); // end_value * end_weight - uni_vfmadd231ps(vmm_valTR, vmm_d_bias, vmm_weightF); // start_value * start_weight + end_value * end_weight + uni_vmulps(vmm_valTR, vmm_valTR, vmm_weightE); // end_value * end_weight + uni_vfmadd231ps(vmm_valTR, + vmm_d_bias, + vmm_weightF); // start_value * start_weight + end_value * end_weight } if (attr_.post_ops_.len() != 0) { @@ -1090,7 +1114,7 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi cubic_c_gathered_matrix(false); if (attr_.post_ops_.len() != 0) { - apply_post_ops(jcp_.dst_prc, false); // vmm_val is default dst value to post_ops and store + apply_post_ops(jcp_.dst_prc, false); // 
vmm_val is default dst value to post_ops and store add(reg_oc_off, vector_step * sizeof(float)); } store(vmm_val, reg_dst, vector_step); @@ -1118,7 +1142,7 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi int src_stride = vector_step * jcp_.src_data_size; add(reg_dst, dst_stride); add(reg_src, src_stride); - sub(reg_work_amount, vector_step); // work_amount is c + sub(reg_work_amount, vector_step); // work_amount is c } else { int dst_stride = blk * jcp_.OW * jcp_.OH * jcp_.dst_data_size; int src_stride = blk * jcp_.IW * jcp_.IH * jcp_.src_data_size; @@ -1143,7 +1167,7 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi cubic_c_gathered_matrix(true); if (attr_.post_ops_.len() != 0) { - apply_post_ops(jcp_.dst_prc, false); // vmm_val is default dst value + apply_post_ops(jcp_.dst_prc, false); // vmm_val is default dst value add(reg_oc_off, scalar_step * sizeof(float)); } store(vmm_val, reg_dst, scalar_step); @@ -1152,7 +1176,7 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi int src_stride = scalar_step * jcp_.src_data_size; add(reg_dst, dst_stride); add(reg_src, src_stride); - sub(reg_work_amount, scalar_step); // work_amount is c + sub(reg_work_amount, scalar_step); // work_amount is c jmp(tail_loop_label, T_NEAR); } @@ -1243,7 +1267,9 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi // build weightX used in y0-y3 // weight format: w0_0 w1_0 w2_0 w3_0 w0_1 w1_1 w2_1 w3_1 ... 
uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); - vgatherdps(vmm_weightX0, ptr[reg_weight_x + vmm_val * grid_len], vmm_mask); // 4 in vmm_val for weight_size, another 4 for grid_len + vgatherdps(vmm_weightX0, + ptr[reg_weight_x + vmm_val * grid_len], + vmm_mask); // 4 in vmm_val for weight_size, another 4 for grid_len uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask); // shift weight_size then gather second weight @@ -1327,8 +1353,20 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi // gather weightX by input idx, used in y0-y3 gather_i32_indices(vmm_weightX0, reg_weight_x, 0, vmm_val, grid_len, ov::element::f32, true); gather_i32_indices(vmm_weightX1, reg_weight_x, sizeof(float), vmm_val, grid_len, ov::element::f32, true); - gather_i32_indices(vmm_weightX2, reg_weight_x, 2 * sizeof(float), vmm_val, grid_len, ov::element::f32, true); - gather_i32_indices(vmm_weightX3, reg_weight_x, 3 * sizeof(float), vmm_val, grid_len, ov::element::f32, true); + gather_i32_indices(vmm_weightX2, + reg_weight_x, + 2 * sizeof(float), + vmm_val, + grid_len, + ov::element::f32, + true); + gather_i32_indices(vmm_weightX3, + reg_weight_x, + 3 * sizeof(float), + vmm_val, + grid_len, + ov::element::f32, + true); // vmm_val is now relieved and used for dst_value uni_vpxor(vmm_val, vmm_val, vmm_val); @@ -1355,7 +1393,13 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi vpminsd(vmm_index_y_itr, vmm_index_y_itr, cubic_planar_table_val(1)); vpmaxsd(vmm_index_y_itr, vmm_index_y_itr, vmm_zero); // weight y2 - gather_i32_indices(vmm_weightY, reg_weight_y, 2 * sizeof(float), vmm_tbl_y, grid_len, ov::element::f32, true); + gather_i32_indices(vmm_weightY, + reg_weight_y, + 2 * sizeof(float), + vmm_tbl_y, + grid_len, + ov::element::f32, + true); cubic_planar_line(true); // y3 @@ -1365,7 +1409,13 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi vpminsd(vmm_index_y_itr, vmm_index_y_itr, cubic_planar_table_val(1)); 
vpmaxsd(vmm_index_y_itr, vmm_index_y_itr, vmm_zero); // weight y3 - gather_i32_indices(vmm_weightY, reg_weight_y, 3 * sizeof(float), vmm_tbl_y, grid_len, ov::element::f32, true); + gather_i32_indices(vmm_weightY, + reg_weight_y, + 3 * sizeof(float), + vmm_tbl_y, + grid_len, + ov::element::f32, + true); cubic_planar_line(true); if (attr_.post_ops_.len() != 0) { @@ -1454,8 +1504,13 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi } // always gather to Vmm, compute with Vmm, store with Xmm if scalar_step - inline void gather_i32_indices(Vmm vmm_src, const Xbyak::Reg64 &base, int offset, Vmm vmm_indices, int scale, - ov::element::Type src_prc, bool is_scalar) { + inline void gather_i32_indices(Vmm vmm_src, + const Xbyak::Reg64& base, + int offset, + Vmm vmm_indices, + int scale, + ov::element::Type src_prc, + bool is_scalar) { Xbyak::Address table_idx = ptr[base + offset + vmm_indices * scale]; if ((isa == cpu::x64::avx512_core) && !is_scalar) { // [0-15] bit of int to mask @@ -1484,8 +1539,8 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi int repeats = is_scalar ? 1 : vlen / sizeof(float); for (int i = 0; i < repeats; ++i) { - mov(reg_tmp_64.cvt32(), ptr[rsp + i * sizeof(int)]); // sizeof(int) index_size - table_idx = ptr[base + offset + reg_tmp_64 * scale]; // scale: sizeof(float) value_size + mov(reg_tmp_64.cvt32(), ptr[rsp + i * sizeof(int)]); // sizeof(int) index_size + table_idx = ptr[base + offset + reg_tmp_64 * scale]; // scale: sizeof(float) value_size mov(reg_tmp_64.cvt32(), table_idx); mov(ptr[rsp + i * sizeof(int)], reg_tmp_64.cvt32()); } @@ -1498,9 +1553,10 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi } } - // is_broadcast for broadcasting param for depth_wise and quantize(channel-sensitive post-ops), for fusion with plain layout. 
+ // is_broadcast for broadcasting param for depth_wise and quantize(channel-sensitive post-ops), for fusion with + // plain layout. void apply_post_ops(ov::element::Type dst_prc, bool is_broadcast) { - const auto &p = attr_.post_ops_; + const auto& p = attr_.post_ops_; int eltwise_inj_idx = 0; int depthwise_inj_idx = 0; int quantization_inj_idx = 0; @@ -1515,8 +1571,11 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi add(reg_d_weights, reg_oc_off); // weight and bias is padded. scalar as vector. - depthwise_injectors[depthwise_inj_idx]->compute_vector_range( - vmm_val.getIdx(), vmm_val.getIdx() + 1, reg_d_weights, reg_d_weights, is_broadcast); + depthwise_injectors[depthwise_inj_idx]->compute_vector_range(vmm_val.getIdx(), + vmm_val.getIdx() + 1, + reg_d_weights, + reg_d_weights, + is_broadcast); post_ops_data_offset += depthwise_injectors[depthwise_inj_idx]->memoryStep(); depthwise_inj_idx++; @@ -1526,15 +1585,25 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi int s_idx = vmm_val.getIdx(); - quantization_injectors[quantization_inj_idx]->init_crop_ptrs(reg_post_ops_data + post_ops_data_offset, reg_oc_off); + quantization_injectors[quantization_inj_idx]->init_crop_ptrs(reg_post_ops_data + post_ops_data_offset, + reg_oc_off); quantization_injectors[quantization_inj_idx]->compute_crop(s_idx, s_idx + 1, 0, 0, is_broadcast); - quantization_injectors[quantization_inj_idx]->init_input_scale_shift_ptrs(reg_post_ops_data + post_ops_data_offset, reg_oc_off); - quantization_injectors[quantization_inj_idx]->compute_input_scale_shift(s_idx, s_idx + 1, 0, do_rounding, 0, is_broadcast); + quantization_injectors[quantization_inj_idx]->init_input_scale_shift_ptrs( + reg_post_ops_data + post_ops_data_offset, + reg_oc_off); + quantization_injectors[quantization_inj_idx] + ->compute_input_scale_shift(s_idx, s_idx + 1, 0, do_rounding, 0, is_broadcast); if (do_dequantization) { - 
quantization_injectors[quantization_inj_idx]->init_output_scale_shift_ptrs(reg_post_ops_data + post_ops_data_offset, reg_oc_off); - quantization_injectors[quantization_inj_idx]->compute_output_scale_shift(s_idx, s_idx + 1, 0, 0, is_broadcast); + quantization_injectors[quantization_inj_idx]->init_output_scale_shift_ptrs( + reg_post_ops_data + post_ops_data_offset, + reg_oc_off); + quantization_injectors[quantization_inj_idx]->compute_output_scale_shift(s_idx, + s_idx + 1, + 0, + 0, + is_broadcast); } post_ops_data_offset += quantization_injectors[quantization_inj_idx]->memoryStep(); @@ -1544,7 +1613,7 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi } }; -#endif // OPENVINO_ARCH_X86_64 +#endif // OPENVINO_ARCH_X86_64 namespace { struct InterpolateKey { @@ -1586,7 +1655,7 @@ size_t InterpolateKey::hash() const { return seed; } -bool InterpolateKey::operator==(const InterpolateKey &rhs) const { +bool InterpolateKey::operator==(const InterpolateKey& rhs) const { if (nodeAttrs.mode != rhs.nodeAttrs.mode) return false; if (nodeAttrs.coordTransMode != rhs.nodeAttrs.coordTransMode) @@ -1620,7 +1689,7 @@ bool InterpolateKey::operator==(const InterpolateKey &rhs) const { return true; } -} // namespace +} // namespace // shapeND: n c d h w // blockND: ncdhw cdhw dhw hw w 1 @@ -1629,7 +1698,7 @@ inline VectorDims getBlockND(const VectorDims& shape) { int shapeRank = shape.size(); VectorDims blockND(shapeRank + 1, 1); for (int i = shapeRank - 1; i >= 0; i--) { - blockND[i] = shape[i] * blockND[i+1]; + blockND[i] = shape[i] * blockND[i + 1]; } return blockND; } @@ -1665,32 +1734,47 @@ using ngInterpShapeCalcMode = ov::op::v4::Interpolate::ShapeCalcMode; bool Interpolate::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { if (const auto interp = std::dynamic_pointer_cast(op)) { - const auto &interpAttr = interp->get_attrs(); - const auto &interpMode = interpAttr.mode; - if (!one_of(interpMode, 
ngInterpMode::NEAREST, ngInterpMode::LINEAR, ngInterpMode::LINEAR_ONNX, ngInterpMode::CUBIC)) { + const auto& interpAttr = interp->get_attrs(); + const auto& interpMode = interpAttr.mode; + if (!one_of(interpMode, + ngInterpMode::NEAREST, + ngInterpMode::LINEAR, + ngInterpMode::LINEAR_ONNX, + ngInterpMode::CUBIC)) { errorMessage = "Interpolate-4 does not support interpolate mode: " + ov::as_string(interpMode); return false; } - const auto &interpCoordTransMode = interpAttr.coordinate_transformation_mode; - if (!one_of(interpCoordTransMode, ngInterpCoordTransf::HALF_PIXEL, ngInterpCoordTransf::PYTORCH_HALF_PIXEL, ngInterpCoordTransf::ASYMMETRIC, - ngInterpCoordTransf::TF_HALF_PIXEL_FOR_NN, ngInterpCoordTransf::ALIGN_CORNERS)) { - errorMessage = "Interpolate-4 does not support coordinate transformation mode: " + ov::as_string(interpCoordTransMode); + const auto& interpCoordTransMode = interpAttr.coordinate_transformation_mode; + if (!one_of(interpCoordTransMode, + ngInterpCoordTransf::HALF_PIXEL, + ngInterpCoordTransf::PYTORCH_HALF_PIXEL, + ngInterpCoordTransf::ASYMMETRIC, + ngInterpCoordTransf::TF_HALF_PIXEL_FOR_NN, + ngInterpCoordTransf::ALIGN_CORNERS)) { + errorMessage = "Interpolate-4 does not support coordinate transformation mode: " + + ov::as_string(interpCoordTransMode); return false; } if (interpMode == ngInterpMode::NEAREST) { - const auto &interpNearestMode = interpAttr.nearest_mode; - if (!one_of(interpNearestMode, ngInterpNearMode::ROUND_PREFER_FLOOR, ngInterpNearMode::ROUND_PREFER_CEIL, ngInterpNearMode::FLOOR, - ngInterpNearMode::CEIL, ngInterpNearMode::SIMPLE)) { - errorMessage = "Interpolate-4 does not support nearest round mode: " + ov::as_string(interpNearestMode); + const auto& interpNearestMode = interpAttr.nearest_mode; + if (!one_of(interpNearestMode, + ngInterpNearMode::ROUND_PREFER_FLOOR, + ngInterpNearMode::ROUND_PREFER_CEIL, + ngInterpNearMode::FLOOR, + ngInterpNearMode::CEIL, + ngInterpNearMode::SIMPLE)) { + errorMessage = + "Interpolate-4 
does not support nearest round mode: " + ov::as_string(interpNearestMode); return false; } } - const auto &interpShapeCalcMode = interpAttr.shape_calculation_mode; + const auto& interpShapeCalcMode = interpAttr.shape_calculation_mode; if (!one_of(interpShapeCalcMode, ngInterpShapeCalcMode::SCALES, ngInterpShapeCalcMode::SIZES)) { - errorMessage = "Interpolate-4 does not support shape_calculation_mode: " + ov::as_string(interpShapeCalcMode); + errorMessage = + "Interpolate-4 does not support shape_calculation_mode: " + ov::as_string(interpShapeCalcMode); return false; } @@ -1701,7 +1785,8 @@ bool Interpolate::isSupportedOperation(const std::shared_ptr& op } if (dataRank == 5 && interpMode == ngInterpMode::CUBIC) { - errorMessage = "Interpolate-4 doesn't support input tensor with rank: " + std::to_string(dataRank) + " for 'cubic' mode "; + errorMessage = "Interpolate-4 doesn't support input tensor with rank: " + std::to_string(dataRank) + + " for 'cubic' mode "; return false; } @@ -1711,21 +1796,22 @@ bool Interpolate::isSupportedOperation(const std::shared_ptr& op return false; } - if (interp->get_input_size() > 3 && - std::dynamic_pointer_cast(interp->get_input_node_shared_ptr(AXES_ID)) == nullptr) { + if (interp->get_input_size() > 3 && std::dynamic_pointer_cast( + interp->get_input_node_shared_ptr(AXES_ID)) == nullptr) { errorMessage = "Only const 'axes' input is supported in Interpolate-4"; return false; } } else if (const auto interp = std::dynamic_pointer_cast(op)) { - const auto &interpAttr = interp->get_attrs(); - const auto &interpMode = interpAttr.mode; + const auto& interpAttr = interp->get_attrs(); + const auto& interpMode = interpAttr.mode; if (!one_of(interpMode, ngInterpMode::BILINEAR_PILLOW, ngInterpMode::BICUBIC_PILLOW)) { errorMessage = "Interpolate-11 does not support interpolate mode: " + ov::as_string(interpMode); return false; } - const auto &interpShapeCalcMode = interpAttr.shape_calculation_mode; + const auto& interpShapeCalcMode = 
interpAttr.shape_calculation_mode; if (!one_of(interpShapeCalcMode, ngInterpShapeCalcMode::SCALES, ngInterpShapeCalcMode::SIZES)) { - errorMessage = "Interpolate-11 does not support shape_calculation_mode: " + ov::as_string(interpShapeCalcMode); + errorMessage = + "Interpolate-11 does not support shape_calculation_mode: " + ov::as_string(interpShapeCalcMode); return false; } const size_t dataRank = interp->get_input_partial_shape(DATA_ID).rank().get_length(); @@ -1735,12 +1821,12 @@ bool Interpolate::isSupportedOperation(const std::shared_ptr& op return false; } if (!isDynamicNgraphNode(op) && - !ov::is_type(op->get_input_node_ptr(SIZE_OR_SCALE_ID_V11))) { + !ov::is_type(op->get_input_node_ptr(SIZE_OR_SCALE_ID_V11))) { errorMessage = "Only const 'scales_or_sizes' input is supported for static shapes in Interpolate-11"; return false; } - if (interp->get_input_size() > 2 && - std::dynamic_pointer_cast(interp->get_input_node_shared_ptr(AXES_ID_V11)) == nullptr) { + if (interp->get_input_size() > 2 && std::dynamic_pointer_cast( + interp->get_input_node_shared_ptr(AXES_ID_V11)) == nullptr) { errorMessage = "Only const 'axes' input is supported in Interpolate-11"; return false; } @@ -1763,19 +1849,14 @@ class InterpolateShapeInferFactory : public ShapeInferFactory { public: InterpolateShapeInferFactory(std::shared_ptr op) : m_op(op) {} ShapeInferPtr makeShapeInfer() const override { - IShapeInfer::port_mask_t port_mask = 0x00; if (auto interp4 = ov::as_type_ptr(m_op)) { - const auto &attr = interp4->get_attrs(); - - if (attr.shape_calculation_mode == ngInterpShapeCalcMode::SCALES) { - port_mask = PortMask(Interpolate::SCALES_ID, Interpolate::AXES_ID); - } else if (attr.shape_calculation_mode == ngInterpShapeCalcMode::SIZES) { - port_mask = PortMask(Interpolate::TARGET_SHAPE_ID, Interpolate::AXES_ID); - } else { - OPENVINO_ASSERT(false, "Unsupported interpolate shape calculation mode"); - } + const auto& attr = interp4->get_attrs(); + const auto is_supported_mode = 
(attr.shape_calculation_mode == ngInterpShapeCalcMode::SCALES) || + (attr.shape_calculation_mode == ngInterpShapeCalcMode::SIZES); + OPENVINO_ASSERT(is_supported_mode, "Unsupported interpolate shape calculation mode"); + return make_shape_inference(m_op); } else if (auto interp11 = ov::as_type_ptr(m_op)) { - port_mask = PortMask(Interpolate::SIZE_OR_SCALE_ID_V11, Interpolate::AXES_ID_V11); + return make_shape_inference(m_op); } else { OPENVINO_THROW("Shape infer factory cannot be created for ", m_op->get_type_name(), @@ -1783,16 +1864,15 @@ class InterpolateShapeInferFactory : public ShapeInferFactory { m_op->get_friendly_name(), ", only versions 4 and 11 are supported."); } - return std::make_shared(make_shape_inference(m_op), port_mask); } private: std::shared_ptr m_op; }; -} // namespace +} // namespace Interpolate::Interpolate(const std::shared_ptr& op, const GraphContext::CPtr context) - : Node(op, context, InterpolateShapeInferFactory(op)) { + : Node(op, context, InterpolateShapeInferFactory(op)) { std::string errorMessage; if (isSupportedOperation(op, errorMessage)) { errorPrefix = "Interpolate node with name '" + getName() + "'"; @@ -1806,9 +1886,9 @@ Interpolate::Interpolate(const std::shared_ptr& op, const GraphContext OPENVINO_THROW(errorPrefix, " has incorrect number of output edges"); isAxesSpecified = numInputs != 3; - const auto &interpAttr = interp->get_attrs(); + const auto& interpAttr = interp->get_attrs(); - const auto &interpMode = interpAttr.mode; + const auto& interpMode = interpAttr.mode; if (interpMode == ngInterpMode::NEAREST) { interpAttrs.mode = InterpolateMode::nearest; } else if (interpMode == ngInterpMode::LINEAR) { @@ -1825,7 +1905,7 @@ Interpolate::Interpolate(const std::shared_ptr& op, const GraphContext OPENVINO_THROW(errorPrefix, " has unsupported interpolate mode"); } - const auto &interpCoordTransMode = interpAttr.coordinate_transformation_mode; + const auto& interpCoordTransMode = interpAttr.coordinate_transformation_mode; if 
(interpCoordTransMode == ngInterpCoordTransf::HALF_PIXEL) { interpAttrs.coordTransMode = InterpolateCoordTransMode::half_pixel; } else if (interpCoordTransMode == ngInterpCoordTransf::PYTORCH_HALF_PIXEL) { @@ -1841,7 +1921,7 @@ Interpolate::Interpolate(const std::shared_ptr& op, const GraphContext } if (interpAttrs.mode == InterpolateMode::nearest) { - const auto &interpNearestMode = interpAttr.nearest_mode; + const auto& interpNearestMode = interpAttr.nearest_mode; if (interpNearestMode == ngInterpNearMode::ROUND_PREFER_FLOOR) { interpAttrs.nearestMode = InterpolateNearestMode::round_prefer_floor; } else if (interpNearestMode == ngInterpNearMode::ROUND_PREFER_CEIL) { @@ -1860,7 +1940,7 @@ Interpolate::Interpolate(const std::shared_ptr& op, const GraphContext } interpAttrs.antialias = interpAttr.antialias; - const auto &interpShapeCalcMode = interpAttr.shape_calculation_mode; + const auto& interpShapeCalcMode = interpAttr.shape_calculation_mode; if (interpShapeCalcMode == ngInterpShapeCalcMode::SCALES) { interpAttrs.shapeCalcMode = InterpolateShapeCalcMode::scales; } else if (interpShapeCalcMode == ngInterpShapeCalcMode::SIZES) { @@ -1885,14 +1965,16 @@ Interpolate::Interpolate(const std::shared_ptr& op, const GraphContext interpAttrs.padEnd[i] = static_cast(interpAttr.pads_end[i]); } - const auto scalesNode = std::dynamic_pointer_cast(interp->get_input_node_shared_ptr(SCALES_ID)); + const auto scalesNode = + std::dynamic_pointer_cast(interp->get_input_node_shared_ptr(SCALES_ID)); if (scalesNode) { scales = scalesNode->cast_vector(); isScaleConstant = true; } if (isAxesSpecified) { - axes = std::dynamic_pointer_cast(interp->get_input_node_shared_ptr(AXES_ID))->cast_vector(); + axes = std::dynamic_pointer_cast(interp->get_input_node_shared_ptr(AXES_ID)) + ->cast_vector(); } else { axes.resize(dataRank); for (int i = 0; i < static_cast(dataRank); i++) { @@ -1908,13 +1990,13 @@ Interpolate::Interpolate(const std::shared_ptr& op, const GraphContext 
OPENVINO_THROW(errorPrefix, " has incorrect number of output edges"); isAxesSpecified = numInputs != 2; - const auto &interpAttr = interp->get_attrs(); - const auto &interpMode = interpAttr.mode; + const auto& interpAttr = interp->get_attrs(); + const auto& interpMode = interpAttr.mode; if (interpMode == ngInterpMode::BILINEAR_PILLOW) { interpAttrs.mode = InterpolateMode::bilinear_pillow; } else if (interpMode == ngInterpMode::BICUBIC_PILLOW) { interpAttrs.mode = InterpolateMode::bicubic_pillow; - interpAttrs.cubeCoeff = static_cast(interpAttr.cube_coeff); // fixed to be -0.5 + interpAttrs.cubeCoeff = static_cast(interpAttr.cube_coeff); // fixed to be -0.5 } else { OPENVINO_THROW(errorPrefix, " has unsupported interpolate mode"); } @@ -1923,10 +2005,11 @@ Interpolate::Interpolate(const std::shared_ptr& op, const GraphContext interpAttrs.coordTransMode = InterpolateCoordTransMode::tf_half_pixel_for_nn; interpAttrs.antialias = interpAttr.antialias; - const auto &interpShapeCalcMode = interpAttr.shape_calculation_mode; + const auto& interpShapeCalcMode = interpAttr.shape_calculation_mode; if (interpShapeCalcMode == ngInterpShapeCalcMode::SCALES) { interpAttrs.shapeCalcMode = InterpolateShapeCalcMode::scales; - const auto scalesNode = std::dynamic_pointer_cast(interp->get_input_node_shared_ptr(SIZE_OR_SCALE_ID_V11)); + const auto scalesNode = std::dynamic_pointer_cast( + interp->get_input_node_shared_ptr(SIZE_OR_SCALE_ID_V11)); if (scalesNode) { scales = scalesNode->cast_vector(); isScaleConstant = true; @@ -1954,7 +2037,9 @@ Interpolate::Interpolate(const std::shared_ptr& op, const GraphContext } if (isAxesSpecified) { - axes = std::dynamic_pointer_cast(interp->get_input_node_shared_ptr(AXES_ID_V11))->cast_vector(); + axes = std::dynamic_pointer_cast( + interp->get_input_node_shared_ptr(AXES_ID_V11)) + ->cast_vector(); if (dataRank == 4 && axes.size() == 2 && axes[0] == 1 && axes[1] == 2 && mayiuse(cpu::x64::sse41)) { NCHWAsNHWC = true; axes[0] = 2; @@ -1993,7 +2078,7 
@@ void Interpolate::getSupportedDescriptors() { break; } } - //correct pad + // correct pad if (hasPad) { NCHWAsNHWC = false; auto correctPad = [&](std::vector pad, int rank) { @@ -2071,15 +2156,21 @@ void Interpolate::initSupportedPrimitiveDescriptors() { } } auto& creatorsMap = BlockedDescCreator::getCommonCreators(); - auto pushDesc = [&](LayoutType dataFormat, impl_desc_type implDetail, bool is_version11, bool useAclExecutor = false) { - config.inConfs[DATA_ID].setMemDesc(creatorsMap.at(dataFormat)->createSharedDesc(inputPrecision, getInputShapeAtPort(DATA_ID))); + auto pushDesc = [&](LayoutType dataFormat, + impl_desc_type implDetail, + bool is_version11, + bool useAclExecutor = false) { + config.inConfs[DATA_ID].setMemDesc( + creatorsMap.at(dataFormat)->createSharedDesc(inputPrecision, getInputShapeAtPort(DATA_ID))); if (is_version11) { if (interpAttrs.shapeCalcMode == InterpolateShapeCalcMode::sizes) { config.inConfs[SIZE_OR_SCALE_ID_V11].setMemDesc( - creatorsMap.at(LayoutType::ncsp)->createSharedDesc(targetShapeType, getInputShapeAtPort(SIZE_OR_SCALE_ID_V11))); + creatorsMap.at(LayoutType::ncsp) + ->createSharedDesc(targetShapeType, getInputShapeAtPort(SIZE_OR_SCALE_ID_V11))); } else { config.inConfs[SIZE_OR_SCALE_ID_V11].setMemDesc( - creatorsMap.at(LayoutType::ncsp)->createSharedDesc(scalesType, getInputShapeAtPort(SIZE_OR_SCALE_ID_V11))); + creatorsMap.at(LayoutType::ncsp) + ->createSharedDesc(scalesType, getInputShapeAtPort(SIZE_OR_SCALE_ID_V11))); } if (isAxesSpecified) @@ -2087,14 +2178,18 @@ void Interpolate::initSupportedPrimitiveDescriptors() { creatorsMap.at(LayoutType::ncsp)->createSharedDesc(axesType, getInputShapeAtPort(AXES_ID_V11))); } else { config.inConfs[TARGET_SHAPE_ID].setMemDesc( - creatorsMap.at(LayoutType::ncsp)->createSharedDesc(targetShapeType, getInputShapeAtPort(TARGET_SHAPE_ID))); - config.inConfs[get_scale_id()].setMemDesc(creatorsMap.at(LayoutType::ncsp)->createSharedDesc(scalesType, getInputShapeAtPort(get_scale_id()))); + 
creatorsMap.at(LayoutType::ncsp) + ->createSharedDesc(targetShapeType, getInputShapeAtPort(TARGET_SHAPE_ID))); + config.inConfs[get_scale_id()].setMemDesc( + creatorsMap.at(LayoutType::ncsp)->createSharedDesc(scalesType, getInputShapeAtPort(get_scale_id()))); if (isAxesSpecified) - config.inConfs[get_axis_id()].setMemDesc(creatorsMap.at(LayoutType::ncsp)->createSharedDesc(axesType, getInputShapeAtPort(get_axis_id()))); + config.inConfs[get_axis_id()].setMemDesc( + creatorsMap.at(LayoutType::ncsp)->createSharedDesc(axesType, getInputShapeAtPort(get_axis_id()))); } - config.outConfs[0].setMemDesc(creatorsMap.at(dataFormat)->createSharedDesc(outputPrecision, getOutputShapeAtPort(0))); + config.outConfs[0].setMemDesc( + creatorsMap.at(dataFormat)->createSharedDesc(outputPrecision, getOutputShapeAtPort(0))); if (useAclExecutor) { std::vector srcMemoryDescs; @@ -2106,8 +2201,11 @@ void Interpolate::initSupportedPrimitiveDescriptors() { dstMemoryDescs.push_back(config.outConfs[i].getMemDesc()); } - auto factory = std::make_shared(interpAttrs, srcMemoryDescs, dstMemoryDescs, - std::make_shared(context, getImplPriority())); + auto factory = std::make_shared( + interpAttrs, + srcMemoryDescs, + dstMemoryDescs, + std::make_shared(context, getImplPriority())); if (!factory->isEmpty()) { supportedPrimitiveDescriptors.push_back({config, implDetail, factory}); } @@ -2116,14 +2214,14 @@ void Interpolate::initSupportedPrimitiveDescriptors() { } }; if (is_version11) { -#if defined (OV_CPU_WITH_ACL) +#if defined(OV_CPU_WITH_ACL) interpAttrs.hasPad = hasPad; pushDesc(LayoutType::nspc, undef, true, true); pushDesc(LayoutType::ncsp, undef, true, true); canUseAclExecutor = !supportedPrimitiveDescriptors.empty(); if (canUseAclExecutor) return; - //fallback to f32 if ref is used + // fallback to f32 if ref is used inputPrecision = outputPrecision = ov::element::f32; #endif @@ -2147,17 +2245,17 @@ void Interpolate::initSupportedPrimitiveDescriptors() { } pushDesc(LayoutType::ncsp, ref, 
true); } else { - const auto &dataMinDims = getInputShapeAtPort(DATA_ID).getMinDims(); + const auto& dataMinDims = getInputShapeAtPort(DATA_ID).getMinDims(); bool isBlkApplied = dataRank > 1 && dataMinDims[1] != Shape::UNDEFINED_DIM && dataMinDims[1] > 1; -#if defined (OV_CPU_WITH_ACL) +#if defined(OV_CPU_WITH_ACL) interpAttrs.hasPad = hasPad; pushDesc(LayoutType::nspc, undef, false, true); pushDesc(LayoutType::ncsp, undef, false, true); canUseAclExecutor = !supportedPrimitiveDescriptors.empty(); if (canUseAclExecutor) return; - //fallback to f32 if ref is used + // fallback to f32 if ref is used inputPrecision = outputPrecision = ov::element::f32; #endif @@ -2202,7 +2300,7 @@ bool Interpolate::needShapeInfer() const { if (lastScales.empty()) { return true; } - const float *scales = getSrcDataAtPortAs(get_scale_id()); + const float* scales = getSrcDataAtPortAs(get_scale_id()); for (size_t i = 0; i < lastScales.size(); i++) { if (lastScales[i] != scales[i]) { return true; @@ -2212,7 +2310,7 @@ bool Interpolate::needShapeInfer() const { if (lastSizes.empty()) { return true; } - const int32_t *sizes = getSrcDataAtPortAs(TARGET_SHAPE_ID); + const int32_t* sizes = getSrcDataAtPortAs(TARGET_SHAPE_ID); for (size_t i = 0; i < lastSizes.size(); i++) { if (sizes[i] != lastSizes[i]) { return true; @@ -2226,12 +2324,12 @@ void Interpolate::executeDynamicImpl(dnnl::stream strm) { execute(strm); const size_t port = interpAttrs.shapeCalcMode == InterpolateShapeCalcMode::sizes ? 
TARGET_SHAPE_ID : get_scale_id(); - const auto &memory = getParentEdgeAt(port)->getMemory(); + const auto& memory = getParentEdgeAt(port)->getMemory(); if (interpAttrs.shapeCalcMode == InterpolateShapeCalcMode::scales) { - const float *scales = memory.getDataAs(); + const float* scales = memory.getDataAs(); lastScales.assign(scales, scales + memory.getDesc().getShape().getElementsCount()); } else { - const int32_t *sizes = memory.getDataAs(); + const int32_t* sizes = memory.getDataAs(); lastSizes.assign(sizes, sizes + memory.getDesc().getShape().getElementsCount()); } } @@ -2284,19 +2382,19 @@ void Interpolate::prepareParams() { OPENVINO_THROW(errorPrefix, " has undefined axes memory"); } - const NodeDesc *selected_pd = getSelectedPrimitiveDescriptor(); + const NodeDesc* selected_pd = getSelectedPrimitiveDescriptor(); if (selected_pd == nullptr) OPENVINO_THROW(errorPrefix, " did not set preferable primitive descriptor"); - const auto &srcDimsOrign = srcMemPtr->getStaticDims(); - const auto &dstDimsOrign = dstMemPtr->getStaticDims(); + const auto& srcDimsOrign = srcMemPtr->getStaticDims(); + const auto& dstDimsOrign = dstMemPtr->getStaticDims(); VectorDims srcDims = srcDimsOrign; VectorDims dstDims = dstDimsOrign; // layoutAlignment if (NCHWAsNHWC && srcMemPtr->getDesc().hasLayoutType(LayoutType::ncsp)) { - auto logicalShapeAlign = [] (VectorDims& Dims) { + auto logicalShapeAlign = [](VectorDims& Dims) { size_t C = Dims[3]; Dims[3] = Dims[2]; Dims[2] = Dims[1]; @@ -2315,7 +2413,8 @@ void Interpolate::prepareParams() { } } - std::vector dataScales = getScales(getPaddedInputShape(srcDims, interpAttrs.padBegin, interpAttrs.padEnd), dstDims); + std::vector dataScales = + getScales(getPaddedInputShape(srcDims, interpAttrs.padBegin, interpAttrs.padEnd), dstDims); if (!NCHWAsNHWC && (getOutputShapeAtPort(0).getRank() > 2 && (dataScales[0] != 1.f || dataScales[1] != 1.f))) { OPENVINO_THROW("Interpolate layer only supports resize on spatial dimensions(depth, height and 
width)"); } @@ -2331,7 +2430,10 @@ void Interpolate::prepareParams() { dstMemoryDescs.push_back(getDstMemoryAtPort(0)->getDescPtr()); auto selectedPD = getSelectedPrimitiveDescriptor(); - aclExecPtr = selectedPD->getExecutorFactoryAs()->makeExecutor(interpAttrs, srcMemoryDescs, dstMemoryDescs, {}); + aclExecPtr = selectedPD->getExecutorFactoryAs()->makeExecutor(interpAttrs, + srcMemoryDescs, + dstMemoryDescs, + {}); selectedPD->setImplementationType(aclExecPtr->getImplType()); return; @@ -2343,26 +2445,25 @@ void Interpolate::prepareParams() { auto buildExecutor = [&](const InterpolateKey& key) -> std::shared_ptr { std::shared_ptr executor; if ((key.nodeAttrs.mode == InterpolateMode::nearest || key.nodeAttrs.mode == InterpolateMode::linear_onnx || - key.nodeAttrs.mode == InterpolateMode::cubic) && + key.nodeAttrs.mode == InterpolateMode::cubic) && ((key.nodeAttrs.layout != InterpolateLayoutType::planar && mayiuse(cpu::x64::sse41)) || - (mayiuse(cpu::x64::avx2) && key.nodeAttrs.inPrc == ov::element::f32))) { + (mayiuse(cpu::x64::avx2) && key.nodeAttrs.inPrc == ov::element::f32))) { executor = std::make_shared(key.nodeAttrs, - key.srcDims, - key.dstDims, - key.dataScales, - key.attr); - } else if ((key.nodeAttrs.mode == InterpolateMode::bilinear_pillow || key.nodeAttrs.mode == InterpolateMode::bicubic_pillow) && - (key.nodeAttrs.layout == InterpolateLayoutType::by_channel)) { + key.srcDims, + key.dstDims, + key.dataScales, + key.attr); + } else if ((key.nodeAttrs.mode == InterpolateMode::bilinear_pillow || + key.nodeAttrs.mode == InterpolateMode::bicubic_pillow) && + (key.nodeAttrs.layout == InterpolateLayoutType::by_channel)) { executor = std::make_shared(key.nodeAttrs, - key.srcDims, - key.dstDims, - key.dataScales, - key.attr); + key.srcDims, + key.dstDims, + key.dataScales, + key.attr); } else { - executor = std::make_shared(key.nodeAttrs, - key.srcDims, - key.dstDims, - key.dataScales); + executor = + std::make_shared(key.nodeAttrs, key.srcDims, key.dstDims, 
key.dataScales); } return executor; }; @@ -2409,18 +2510,18 @@ static inline float triangleCoeff(float x) { return (std::max)(0.0f, 1 - std::abs(x)); } -void Interpolate::setPostOps(dnnl::primitive_attr &attr, const VectorDims &dims) { +void Interpolate::setPostOps(dnnl::primitive_attr& attr, const VectorDims& dims) { dnnl::post_ops ops; postOpsDataPtrs.clear(); - for (auto &node : fusedWith) { - auto* fakeQuantizeNode = dynamic_cast(node.get()); + for (auto& node : fusedWith) { + auto* fakeQuantizeNode = dynamic_cast(node.get()); if (fakeQuantizeNode) { fakeQuantizeNode->appendPostOps(ops, {}, postOpsDataPtrs); continue; } - auto* eltwiseNode = dynamic_cast(node.get()); + auto* eltwiseNode = dynamic_cast(node.get()); if (eltwiseNode) { eltwiseNode->appendPostOps(ops, dims, postOpsDataPtrs); continue; @@ -2436,9 +2537,9 @@ void Interpolate::setPostOps(dnnl::primitive_attr &attr, const VectorDims &dims) attr.set_post_ops(ops); } -VectorDims Interpolate::getPaddedInputShape(const VectorDims &srcDims, - const std::vector &padBegin, - const std::vector &padEnd) { +VectorDims Interpolate::getPaddedInputShape(const VectorDims& srcDims, + const std::vector& padBegin, + const std::vector& padEnd) { VectorDims paddedShape; int dataRank = srcDims.size(); for (int i = 0; i < dataRank; i++) { @@ -2450,18 +2551,21 @@ VectorDims Interpolate::getPaddedInputShape(const VectorDims &srcDims, // get scales of data rank size // if "scale" version: set scales with input scales, 1.f for other dims not in axis // if "size" version: scales = shape[target] / shape[input].pad, 1.f for other dims not in axis -// scales is a required input, but should not use input scales when "size" case, which may added eps or is a dummy value, recalculate scales instead. 
-std::vector Interpolate::getScales(const VectorDims &srcDimPad, const VectorDims &dstDim) { +// scales is a required input, but should not use input scales when "size" case, which may added eps or is a dummy +// value, recalculate scales instead. +std::vector Interpolate::getScales(const VectorDims& srcDimPad, const VectorDims& dstDim) { std::vector fullScales(dataRank, 1.f); const size_t axesRank = axes.size(); for (size_t i = 0; i < axesRank; i++) { int axis = axes[i]; // pillow always re-generate scales with input and output shape - if (interpAttrs.mode == InterpolateMode::bilinear_pillow || interpAttrs.mode == InterpolateMode::bicubic_pillow) { + if (interpAttrs.mode == InterpolateMode::bilinear_pillow || + interpAttrs.mode == InterpolateMode::bicubic_pillow) { fullScales[axis] = static_cast(dstDim[axis]) / static_cast(srcDimPad[axis]); } else { - fullScales[axis] = (interpAttrs.shapeCalcMode == InterpolateShapeCalcMode::scales) ? scales[i] : - static_cast(dstDim[axis]) / static_cast(srcDimPad[axis]); + fullScales[axis] = (interpAttrs.shapeCalcMode == InterpolateShapeCalcMode::scales) + ? 
scales[i] + : static_cast(dstDim[axis]) / static_cast(srcDimPad[axis]); } } return fullScales; @@ -2472,12 +2576,12 @@ void Interpolate::execute(dnnl::stream strm) { auto srcMemPtr = getSrcMemoryAtPort(DATA_ID); if (execPtr) { - uint8_t *dst_data = dstMemPtr->getDataAs(); - const uint8_t *src_data_origin = srcMemPtr->getDataAs(); - const uint8_t *src_data = nullptr; + uint8_t* dst_data = dstMemPtr->getDataAs(); + const uint8_t* src_data_origin = srcMemPtr->getDataAs(); + const uint8_t* src_data = nullptr; std::vector srcPadded; if (hasPad) { - const auto &srcDim = srcMemPtr->getStaticDims(); + const auto& srcDim = srcMemPtr->getStaticDims(); auto srcDimPad = execPtr->getSrcDimPad5d(); size_t dimSize = srcDim.size(); @@ -2496,23 +2600,34 @@ void Interpolate::execute(dnnl::stream strm) { if (interpAttrs.layout == InterpolateLayoutType::planar) { srcPadded.resize(inShapePadBlock[0] * srcDataSize, 0); - uint8_t *src_data_pad = static_cast(&srcPadded[0]); + uint8_t* src_data_pad = static_cast(&srcPadded[0]); parallel_for4d(srcDim5d[0], srcDim5d[1], srcDim5d[2], srcDim5d[3], [&](int n, int c, int d, int h) { - const uint8_t *src = src_data_origin + - (inShapeBlock[1] * n + inShapeBlock[2] * c + inShapeBlock[3] * d + inShapeBlock[4] * h) * srcDataSize; - uint8_t *srcPad = src_data_pad + (inShapePadBlock[1] * (n + padB0) + inShapePadBlock[2] * (c + padB1) + - inShapePadBlock[3] * (d + padB2) + inShapePadBlock[4] * (h + padB3) + padB4) * srcDataSize; + const uint8_t* src = src_data_origin + (inShapeBlock[1] * n + inShapeBlock[2] * c + + inShapeBlock[3] * d + inShapeBlock[4] * h) * + srcDataSize; + uint8_t* srcPad = + src_data_pad + (inShapePadBlock[1] * (n + padB0) + inShapePadBlock[2] * (c + padB1) + + inShapePadBlock[3] * (d + padB2) + inShapePadBlock[4] * (h + padB3) + padB4) * + srcDataSize; cpu_memcpy(srcPad, src, srcDim5d[4] * srcDataSize); }); src_data = src_data_pad; } else if (interpAttrs.layout == InterpolateLayoutType::by_channel) { 
srcPadded.resize(inShapePadBlock[0] * srcDataSize, 0); - uint8_t *src_data_pad = static_cast(&srcPadded[0]); + uint8_t* src_data_pad = static_cast(&srcPadded[0]); parallel_for4d(srcDim5d[0], srcDim5d[2], srcDim5d[3], srcDim5d[4], [&](int n, int d, int h, int w) { - const uint8_t *src = src_data_origin + (inShapeBlock[1] * n + - (inShapeBlock[3] * d + inShapeBlock[4] * h + inShapeBlock[5] * w) * srcDim5d[1]) * srcDataSize; - uint8_t *srcPad = src_data_pad + (inShapePadBlock[1] * (n + padB0) + (inShapePadBlock[3] * (d + padB2) + - inShapePadBlock[4] * (h + padB3) + inShapePadBlock[5] * (w + padB4)) * srcDimPad5d[1] + padB1) * srcDataSize; + const uint8_t* src = + src_data_origin + + (inShapeBlock[1] * n + + (inShapeBlock[3] * d + inShapeBlock[4] * h + inShapeBlock[5] * w) * srcDim5d[1]) * + srcDataSize; + uint8_t* srcPad = + src_data_pad + (inShapePadBlock[1] * (n + padB0) + + (inShapePadBlock[3] * (d + padB2) + inShapePadBlock[4] * (h + padB3) + + inShapePadBlock[5] * (w + padB4)) * + srcDimPad5d[1] + + padB1) * + srcDataSize; cpu_memcpy(srcPad, src, srcDim5d[1] * srcDataSize); }); src_data = src_data_pad; @@ -2521,25 +2636,34 @@ void Interpolate::execute(dnnl::stream strm) { size_t CB = div_up(srcDimPad5d[1], blkSize); size_t eltsTotal = srcDimPad5d[0] * CB * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize; srcPadded.resize(eltsTotal * srcDataSize, 0x0); - uint8_t *src_data_pad = static_cast(&srcPadded[0]); + uint8_t* src_data_pad = static_cast(&srcPadded[0]); if ((srcDim5d[0] != srcDimPad5d[0]) || (srcDim5d[1] != srcDimPad5d[1])) { OPENVINO_THROW("Interpolate layer with name '", getName(), "' does not support padding on batch and channel dimensions"); } - parallel_for5d(srcDim5d[0], CB, srcDim5d[2], srcDim5d[3], srcDim5d[4], [&](int n, int cb, int d, int h, int w) { - const uint8_t *src = src_data_origin + (n * CB * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize - + (cb * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize 
- + (d * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize - + (h * srcDim5d[4] * blkSize) * srcDataSize - + (w * blkSize) * srcDataSize; - uint8_t *srcPad = src_data_pad + (n * CB * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize - + (cb * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize - + ((d + padB2) * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize - + ((h + padB3) * srcDimPad5d[4] * blkSize) * srcDataSize - + ((w + padB4) * blkSize) * srcDataSize; - cpu_memcpy(srcPad, src, blkSize * srcDataSize); - }); + parallel_for5d(srcDim5d[0], + CB, + srcDim5d[2], + srcDim5d[3], + srcDim5d[4], + [&](int n, int cb, int d, int h, int w) { + const uint8_t* src = + src_data_origin + + (n * CB * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize + + (cb * srcDim5d[2] * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize + + (d * srcDim5d[3] * srcDim5d[4] * blkSize) * srcDataSize + + (h * srcDim5d[4] * blkSize) * srcDataSize + (w * blkSize) * srcDataSize; + uint8_t* srcPad = + src_data_pad + + (n * CB * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * + srcDataSize + + (cb * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize + + ((d + padB2) * srcDimPad5d[3] * srcDimPad5d[4] * blkSize) * srcDataSize + + ((h + padB3) * srcDimPad5d[4] * blkSize) * srcDataSize + + ((w + padB4) * blkSize) * srcDataSize; + cpu_memcpy(srcPad, src, blkSize * srcDataSize); + }); src_data = src_data_pad; } } else { @@ -2556,26 +2680,35 @@ void Interpolate::execute(dnnl::stream strm) { // for ndhwc and nCdhw8c[16c] // input may be f32/bf16/int8, fused->output varies -void Interpolate::InterpolateJitExecutor::NNCGathered(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, - int B, int C, int ID, int IH, int IW, int OD, int OH, int OW) { - int *index_d = static_cast(&auxTable[0]); - int *index_h = static_cast(&auxTable[OD]); - int *index_w = static_cast(&auxTable[OD + OH]); 
+void Interpolate::InterpolateJitExecutor::NNCGathered(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + const void* post_ops_data_, + int B, + int C, + int ID, + int IH, + int IW, + int OD, + int OH, + int OW) { + int* index_d = static_cast(&auxTable[0]); + int* index_h = static_cast(&auxTable[OD]); + int* index_w = static_cast(&auxTable[OD + OH]); bool is_nhwc = (configured_for_layout == by_channel); for (int b = 0; b < B; b++) { if (is_nhwc) { - const uint8_t *in_ptr = in_ptr_ + (IW * IH * ID * C * b) * srcDataSize; - uint8_t *out_ptr = out_ptr_ + (OW * OH * OD * C * b) * dstDataSize; + const uint8_t* in_ptr = in_ptr_ + (IW * IH * ID * C * b) * srcDataSize; + uint8_t* out_ptr = out_ptr_ + (OW * OH * OD * C * b) * dstDataSize; std::vector index_w_kernel(OW); for (int ox = 0; ox < OW; ox++) { index_w_kernel[ox] = index_w[ox] * C * srcDataSize; } parallel_for2d(OD, OH, [&](size_t d, size_t h) { // kernel for C * OW - uint8_t *out_ptr_dh = out_ptr + (C * OW * OH * d + C * OW * h) * dstDataSize; - const uint8_t *in_ptr_dh = in_ptr + (C * IW * IH * index_d[d] + C * IW * index_h[h]) * srcDataSize; + uint8_t* out_ptr_dh = out_ptr + (C * OW * OH * d + C * OW * h) * dstDataSize; + const uint8_t* in_ptr_dh = in_ptr + (C * IW * IH * index_d[d] + C * IW * index_h[h]) * srcDataSize; auto arg = jit_interpolate_call_args(); arg.dst = out_ptr_dh; arg.src_ptr[0] = in_ptr_dh; @@ -2588,15 +2721,16 @@ void Interpolate::InterpolateJitExecutor::NNCGathered(const uint8_t *in_ptr_, ui } else { // for blk int blk_size = mayiuse(cpu::x64::avx512_core) ? 
16 : 8; int CB = div_up(C, blk_size); - const uint8_t *in_ptr = in_ptr_ + (IW * IH * ID * CB * blk_size * b) * srcDataSize; - uint8_t *out_ptr = out_ptr_ + (OW * OH * OD * CB * blk_size * b) * dstDataSize; + const uint8_t* in_ptr = in_ptr_ + (IW * IH * ID * CB * blk_size * b) * srcDataSize; + uint8_t* out_ptr = out_ptr_ + (OW * OH * OD * CB * blk_size * b) * dstDataSize; std::vector index_w_kernel(OW); for (int ox = 0; ox < OW; ox++) { index_w_kernel[ox] = index_w[ox] * blk_size * srcDataSize; } parallel_for2d(CB, OD, [&](size_t cb, size_t d) { - uint8_t *out_ptr_cbd = out_ptr + (blk_size * OW * OH * OD * cb + blk_size * OW * OH * d) * dstDataSize; - const uint8_t *in_ptr_cbd = in_ptr + (blk_size * IW * IH * ID * cb + blk_size * IW * IH * index_d[d]) * srcDataSize; + uint8_t* out_ptr_cbd = out_ptr + (blk_size * OW * OH * OD * cb + blk_size * OW * OH * d) * dstDataSize; + const uint8_t* in_ptr_cbd = + in_ptr + (blk_size * IW * IH * ID * cb + blk_size * IW * IH * index_d[d]) * srcDataSize; auto arg = jit_interpolate_call_args(); for (int h = 0; h < OH; h++) { // kernel for blk_size * OW arg.dst = out_ptr_cbd + blk_size * OW * h * dstDataSize; @@ -2612,11 +2746,20 @@ void Interpolate::InterpolateJitExecutor::NNCGathered(const uint8_t *in_ptr_, ui } // batch end } -void Interpolate::InterpolateJitExecutor::NNPlanar(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, - int B, int C, int ID, int IH, int IW, int OD, int OH, int OW) { - int *index_d = static_cast(&auxTable[0]); - int *index_h = static_cast(&auxTable[OD]); - int *index_w = static_cast(&auxTable[OD + OH]); +void Interpolate::InterpolateJitExecutor::NNPlanar(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + const void* post_ops_data_, + int B, + int C, + int ID, + int IH, + int IW, + int OD, + int OH, + int OW) { + int* index_d = static_cast(&auxTable[0]); + int* index_h = static_cast(&auxTable[OD]); + int* index_w = static_cast(&auxTable[OD + OH]); std::vector index_kernel(OH + OW); // 
index_h * IW * srcDataSize to reduce and simplify redundant compute @@ -2629,13 +2772,15 @@ void Interpolate::InterpolateJitExecutor::NNPlanar(const uint8_t *in_ptr_, uint8 } parallel_for3d(B, C, OD, [&](size_t b, size_t c, size_t od) { - const uint8_t *in_ptr = in_ptr_ + (IW * IH * ID * C * b + IW * IH * ID * c + IW * IH * index_d[od]) * srcDataSize; - uint8_t *out_ptr = out_ptr_ + (OW * OH * OD * C * b + OW * OH * OD * c + OW * OH * od) * dstDataSize; + const uint8_t* in_ptr = + in_ptr_ + (IW * IH * ID * C * b + IW * IH * ID * c + IW * IH * index_d[od]) * srcDataSize; + uint8_t* out_ptr = out_ptr_ + (OW * OH * OD * C * b + OW * OH * OD * c + OW * OH * od) * dstDataSize; auto arg = jit_interpolate_call_args(); arg.src_ptr[0] = in_ptr; arg.dst = out_ptr; - arg.index = static_cast(&index_kernel[0]); // need index_h and index_w in kernel, it's in continous memory so one param + arg.index = static_cast( + &index_kernel[0]); // need index_h and index_w in kernel, it's in continous memory so one param arg.oc_off = static_cast(c * sizeof(float)); // work_amount is OH(out loop) and OW(inner loop), can get in kernel from jcp. 
arg.post_op_data = post_ops_data_; @@ -2643,18 +2788,27 @@ void Interpolate::InterpolateJitExecutor::NNPlanar(const uint8_t *in_ptr_, uint8 }); } -void Interpolate::InterpolateJitExecutor::linearOnnxPlanar(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, int B, int C, - int ID, int IH, int IW, int OD, int OH, int OW) { - // FrontTopLeft:0, FrontTopRight:1, FrontBottomLeft:2, FrontBottomRight:3, EndTopLeft:4, EndTopRight:5, EndBottomLeft:6, EndBottomRight:7 - // weight: Left:0, ritht:1, top:2, bottom:3, front:4, end:5 - int *index = static_cast(&auxTable[0]); +void Interpolate::InterpolateJitExecutor::linearOnnxPlanar(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + const void* post_ops_data_, + int B, + int C, + int ID, + int IH, + int IW, + int OD, + int OH, + int OW) { + // FrontTopLeft:0, FrontTopRight:1, FrontBottomLeft:2, FrontBottomRight:3, EndTopLeft:4, EndTopRight:5, + // EndBottomLeft:6, EndBottomRight:7 weight: Left:0, ritht:1, top:2, bottom:3, front:4, end:5 + int* index = static_cast(&auxTable[0]); int eltInGrid = (spatialDimSize > 2) ? MAX_INPUT_INTERPOLATE : ((spatialDimSize > 1) ? 
4 : 2); int scratchLen = rnd_up(eltInGrid * OW * OH * OD, 16); - float *weight = reinterpret_cast(&auxTable[scratchLen]); + float* weight = reinterpret_cast(&auxTable[scratchLen]); parallel_for2d(B, C, [&](size_t b, size_t c) { - uint8_t *out_ptr_nc = out_ptr_ + (OH * OW * OD * C * b + OH * OW * OD * c) * dstDataSize; - const uint8_t *in_ptr_nc = in_ptr_ + (IH * IW * ID * C * b + IH * IW * ID * c) * srcDataSize; + uint8_t* out_ptr_nc = out_ptr_ + (OH * OW * OD * C * b + OH * OW * OD * c) * dstDataSize; + const uint8_t* in_ptr_nc = in_ptr_ + (IH * IW * ID * C * b + IH * IW * ID * c) * srcDataSize; auto arg = jit_interpolate_call_args(); arg.src_ptr[0] = in_ptr_nc; arg.index = static_cast(&index[0]); @@ -2667,8 +2821,17 @@ void Interpolate::InterpolateJitExecutor::linearOnnxPlanar(const uint8_t *in_ptr }); } -void Interpolate::InterpolateJitExecutor::linearOnnxCGathered(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, - int B, int C, int ID, int IH, int IW, int OD, int OH, int OW) { +void Interpolate::InterpolateJitExecutor::linearOnnxCGathered(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + const void* post_ops_data_, + int B, + int C, + int ID, + int IH, + int IW, + int OD, + int OH, + int OW) { // left:OW right:OW Top:OH Bottom:OH Front:OD End:OD std::vector indexPtr(MAX_INPUT_INTERPOLATE, 0); std::vector weightPtr(MAX_INPUT_INTERPOLATE, 0); @@ -2703,18 +2866,18 @@ void Interpolate::InterpolateJitExecutor::linearOnnxCGathered(const uint8_t *in_ int I2 = ID * I1; int I3 = CB * I2; parallel_for3d(B, OD, OH, [&](size_t b, size_t d, size_t h) { - uint8_t *out_ptr_ndh = out_ptr_ + (C3 * b + C1 * d + C0 * h) * dstDataSize; - - const uint8_t *in_ptr_n = in_ptr_ + (I3 * b) * srcDataSize; - const uint8_t *in_ptr_nf = in_ptr_n + (indexPtr[4][d] * I1) * srcDataSize; - const uint8_t *in_ptr_nft = in_ptr_nf + (indexPtr[2][h] * I0) * srcDataSize; - const uint8_t *in_ptr_nfb = in_ptr_nf + (indexPtr[3][h] * I0) * srcDataSize; - const uint8_t *in_ptr_ne = 
in_ptr_n + (indexPtr[5][d] * I1) * srcDataSize; - const uint8_t *in_ptr_net = in_ptr_ne + (indexPtr[2][h] * I0) * srcDataSize; - const uint8_t *in_ptr_neb = in_ptr_ne + (indexPtr[3][h] * I0) * srcDataSize; + uint8_t* out_ptr_ndh = out_ptr_ + (C3 * b + C1 * d + C0 * h) * dstDataSize; + + const uint8_t* in_ptr_n = in_ptr_ + (I3 * b) * srcDataSize; + const uint8_t* in_ptr_nf = in_ptr_n + (indexPtr[4][d] * I1) * srcDataSize; + const uint8_t* in_ptr_nft = in_ptr_nf + (indexPtr[2][h] * I0) * srcDataSize; + const uint8_t* in_ptr_nfb = in_ptr_nf + (indexPtr[3][h] * I0) * srcDataSize; + const uint8_t* in_ptr_ne = in_ptr_n + (indexPtr[5][d] * I1) * srcDataSize; + const uint8_t* in_ptr_net = in_ptr_ne + (indexPtr[2][h] * I0) * srcDataSize; + const uint8_t* in_ptr_neb = in_ptr_ne + (indexPtr[3][h] * I0) * srcDataSize; auto arg = jit_interpolate_call_args(); for (int w = 0; w < OW; ++w) { - uint8_t *out_ptr_ndhw = out_ptr_ndh + CGatherLen * w * dstDataSize; + uint8_t* out_ptr_ndhw = out_ptr_ndh + CGatherLen * w * dstDataSize; arg.src_ptr[0] = in_ptr_nft + (indexPtr[0][w] * CGatherLen) * srcDataSize; arg.src_ptr[1] = in_ptr_nft + (indexPtr[1][w] * CGatherLen) * srcDataSize; @@ -2739,13 +2902,20 @@ void Interpolate::InterpolateJitExecutor::linearOnnxCGathered(const uint8_t *in_ }); } -void Interpolate::InterpolateJitExecutor::cubicCGathered(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, - int B, int C, int IH, int IW, int OH, int OW) { +void Interpolate::InterpolateJitExecutor::cubicCGathered(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + const void* post_ops_data_, + int B, + int C, + int IH, + int IW, + int OH, + int OW) { const int idxNum = 1; - int *xOrigin = static_cast(&auxTable[0]); - float *xFactor = reinterpret_cast(&auxTable[OW]); - int *yOrigin = static_cast(&auxTable[(CUBIC_GRID_LEN + idxNum) * OW]); - float *yFactor = reinterpret_cast(&auxTable[(CUBIC_GRID_LEN + idxNum) * OW + OH]); + int* xOrigin = static_cast(&auxTable[0]); + float* xFactor 
= reinterpret_cast(&auxTable[OW]); + int* yOrigin = static_cast(&auxTable[(CUBIC_GRID_LEN + idxNum) * OW]); + float* yFactor = reinterpret_cast(&auxTable[(CUBIC_GRID_LEN + idxNum) * OW + OH]); int blkSize = mayiuse(cpu::x64::avx512_core) ? 16 : 8; int CB = div_up(C, blkSize); @@ -2754,8 +2924,8 @@ void Interpolate::InterpolateJitExecutor::cubicCGathered(const uint8_t *in_ptr_, int workAmount = configured_for_layout == InterpolateLayoutType::by_channel ? C : CB; parallel_for3d(B, OH, OW, [&](size_t b, size_t h, size_t w) { - uint8_t *out_ptr_nhw = out_ptr_ + (OH * OW * CSize * b + OW * CGatherLen * h + CGatherLen * w) * dstDataSize; - const uint8_t *in_ptr_n = in_ptr_ + (IH * IW * CSize * b) * srcDataSize; + uint8_t* out_ptr_nhw = out_ptr_ + (OH * OW * CSize * b + OW * CGatherLen * h + CGatherLen * w) * dstDataSize; + const uint8_t* in_ptr_n = in_ptr_ + (IH * IW * CSize * b) * srcDataSize; std::vector kernelIndex(CUBIC_GRID_LEN * CUBIC_GRID_LEN); // 16 address offset to src(batch) or src(CB) int iy = yOrigin[h]; @@ -2770,41 +2940,48 @@ void Interpolate::InterpolateJitExecutor::cubicCGathered(const uint8_t *in_ptr_, } } auto arg = jit_interpolate_call_args(); - arg.dst = out_ptr_nhw; - arg.src_ptr[0] = in_ptr_n; - arg.index = static_cast(&kernelIndex[0]); - // 0 for weight_W, 1 for weight_H - arg.weight_ptr[0] = static_cast(&xFactor[w * CUBIC_GRID_LEN]); - arg.weight_ptr[1] = static_cast(&yFactor[h * CUBIC_GRID_LEN]); - - // for by channel, src + step, dst + step, process next step on continuous memory - // for blk, src + IW*IH*blkSize, dst + OW*OH*blkSize, process the blkSize on next CB - arg.work_amount = workAmount; - arg.oc_off = 0; - arg.post_op_data = post_ops_data_; - (*interpolateKernel)(&arg); + arg.dst = out_ptr_nhw; + arg.src_ptr[0] = in_ptr_n; + arg.index = static_cast(&kernelIndex[0]); + // 0 for weight_W, 1 for weight_H + arg.weight_ptr[0] = static_cast(&xFactor[w * CUBIC_GRID_LEN]); + arg.weight_ptr[1] = static_cast(&yFactor[h * CUBIC_GRID_LEN]); + + 
// for by channel, src + step, dst + step, process next step on continuous memory + // for blk, src + IW*IH*blkSize, dst + OW*OH*blkSize, process the blkSize on next CB + arg.work_amount = workAmount; + arg.oc_off = 0; + arg.post_op_data = post_ops_data_; + (*interpolateKernel)(&arg); }); } -void Interpolate::InterpolateJitExecutor::cubicPlanar(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, - int B, int C, int IH, int IW, int OH, int OW) { +void Interpolate::InterpolateJitExecutor::cubicPlanar(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + const void* post_ops_data_, + int B, + int C, + int IH, + int IW, + int OH, + int OW) { int tblAdvance = 0; - int *xOrigin = static_cast(&auxTable[tblAdvance]); + int* xOrigin = static_cast(&auxTable[tblAdvance]); tblAdvance += OW; - float *xFactor = reinterpret_cast(&auxTable[tblAdvance]); + float* xFactor = reinterpret_cast(&auxTable[tblAdvance]); tblAdvance += CUBIC_GRID_LEN * OW; - int *yOrigin = static_cast(&auxTable[tblAdvance]); + int* yOrigin = static_cast(&auxTable[tblAdvance]); tblAdvance += OH; - float *yFactor = reinterpret_cast(&auxTable[tblAdvance]); + float* yFactor = reinterpret_cast(&auxTable[tblAdvance]); tblAdvance += CUBIC_GRID_LEN * OH; - int *sequenceOH = static_cast(&auxTable[tblAdvance]); + int* sequenceOH = static_cast(&auxTable[tblAdvance]); tblAdvance += OW * OH; - int *sequenceOW = static_cast(&auxTable[tblAdvance]); + int* sequenceOW = static_cast(&auxTable[tblAdvance]); parallel_for2d(B, C, [&](size_t n, size_t c) { - const uint8_t *in_ptr_nc = in_ptr_ + (IW * IH * C * n + IW * IH * c) * srcDataSize; - uint8_t *out_ptr_nc = out_ptr_ + (OW * OH * C * n + OW * OH * c) * dstDataSize; + const uint8_t* in_ptr_nc = in_ptr_ + (IW * IH * C * n + IW * IH * c) * srcDataSize; + uint8_t* out_ptr_nc = out_ptr_ + (OW * OH * C * n + OW * OH * c) * dstDataSize; auto arg = jit_interpolate_call_args(); arg.dst = out_ptr_nc; @@ -2822,8 +2999,15 @@ void 
Interpolate::InterpolateJitExecutor::cubicPlanar(const uint8_t *in_ptr_, ui }); } -void Interpolate::InterpolateJitExecutor::pillowCGathered(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, - int B, int C, int IH, int IW, int OH, int OW) { +void Interpolate::InterpolateJitExecutor::pillowCGathered(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + const void* post_ops_data_, + int B, + int C, + int IH, + int IW, + int OH, + int OW) { // workBuffer needed when both pass are true bool xPass = IW != OW; bool yPass = IH != OH; @@ -2855,8 +3039,11 @@ void Interpolate::InterpolateJitExecutor::pillowCGathered(const uint8_t *in_ptr_ // ===================================================================================================================== // index layout: // d_0............d_OD-1, h_0..............h_OH-1, w_0................w_OW-1 -void Interpolate::InterpolateExecutorBase::buildTblNN(const VectorDims& srcDimPad5d, const VectorDims& dstDim5d, - const std::vector& dataScales, InterpolateLayoutType layout, InterpolateNearestMode nearestMode) { +void Interpolate::InterpolateExecutorBase::buildTblNN(const VectorDims& srcDimPad5d, + const VectorDims& dstDim5d, + const std::vector& dataScales, + InterpolateLayoutType layout, + InterpolateNearestMode nearestMode) { const int dimSize = dataRank; float fz = (dimSize == 5) ? 
dataScales[dimSize - 3] : 1.f; float fy = dataScales[dimSize - 2]; @@ -2888,80 +3075,91 @@ void Interpolate::InterpolateExecutorBase::buildTblNN(const VectorDims& srcDimPa // scale is float(outShape) / float(inShape) // strictly consistent with onnx calc manner(div scale, not multiply inverse), given this is done offline // the slight precison diff can produce obvious wrong value due to "nearest round" behavior for NN mode -float Interpolate::InterpolateExecutorBase::coordTransToInput(int outCoord, float scale, int inShape, int outShape) const { +float Interpolate::InterpolateExecutorBase::coordTransToInput(int outCoord, + float scale, + int inShape, + int outShape) const { if (scale == 1.0f || (inShape == outShape)) { return outCoord; } switch (coordTransMode) { - case InterpolateCoordTransMode::half_pixel: { + case InterpolateCoordTransMode::half_pixel: { + return (outCoord + 0.5f) / scale - 0.5f; + break; + } + case InterpolateCoordTransMode::pytorch_half_pixel: { + if (outShape > 1) return (outCoord + 0.5f) / scale - 0.5f; - break; - } - case InterpolateCoordTransMode::pytorch_half_pixel: { - if (outShape > 1) - return (outCoord + 0.5f) / scale - 0.5f; - else - return 0; - break; - } - case InterpolateCoordTransMode::asymmetric: { - return static_cast(outCoord) / scale; - break; - } - case InterpolateCoordTransMode::tf_half_pixel_for_nn: { - return (outCoord + 0.5f) / scale; - break; - } - case InterpolateCoordTransMode::align_corners: { - if (outShape > 1) - return outCoord * (static_cast(inShape - 1) / static_cast(outShape - 1)); - else - return 0; - break; - } - default: { - OPENVINO_THROW("errorPrefix", " does not support specified coordinate transformation mode"); - break; - } + else + return 0; + break; + } + case InterpolateCoordTransMode::asymmetric: { + return static_cast(outCoord) / scale; + break; + } + case InterpolateCoordTransMode::tf_half_pixel_for_nn: { + return (outCoord + 0.5f) / scale; + break; + } + case 
InterpolateCoordTransMode::align_corners: { + if (outShape > 1) + return outCoord * (static_cast(inShape - 1) / static_cast(outShape - 1)); + else + return 0; + break; + } + default: { + OPENVINO_THROW("errorPrefix", " does not support specified coordinate transformation mode"); + break; + } } } -int Interpolate::InterpolateExecutorBase::nearestRound(float originCoord, bool isDownsample, InterpolateNearestMode nearestMode) const { +int Interpolate::InterpolateExecutorBase::nearestRound(float originCoord, + bool isDownsample, + InterpolateNearestMode nearestMode) const { switch (nearestMode) { - case InterpolateNearestMode::round_prefer_floor: { - if (originCoord == (static_cast(originCoord) + 0.5f)) - return static_cast(std::floor(originCoord)); - else - return static_cast(std::round(originCoord)); - break; - } - case InterpolateNearestMode::round_prefer_ceil: { - return static_cast(std::round(originCoord)); - break; - } - case InterpolateNearestMode::floor: { + case InterpolateNearestMode::round_prefer_floor: { + if (originCoord == (static_cast(originCoord) + 0.5f)) return static_cast(std::floor(originCoord)); - break; - } - case InterpolateNearestMode::ceil: { + else + return static_cast(std::round(originCoord)); + break; + } + case InterpolateNearestMode::round_prefer_ceil: { + return static_cast(std::round(originCoord)); + break; + } + case InterpolateNearestMode::floor: { + return static_cast(std::floor(originCoord)); + break; + } + case InterpolateNearestMode::ceil: { + return static_cast(std::ceil(originCoord)); + break; + } + case InterpolateNearestMode::simple: { + if (isDownsample) return static_cast(std::ceil(originCoord)); - break; - } - case InterpolateNearestMode::simple: { - if (isDownsample) - return static_cast(std::ceil(originCoord)); - else - return static_cast(originCoord); - } - default: { - OPENVINO_THROW("errorPrefix", " does not support specified nearest round mode"); - break; - } + else + return static_cast(originCoord); + } + default: { + 
OPENVINO_THROW("errorPrefix", " does not support specified nearest round mode"); + break; + } } } -void Interpolate::InterpolateExecutorBase::linearOnnxCF(int outCoord, float scale, int inShape, int outShape, - int& index0, int& index1, float& weight0, float& weight1) { +void Interpolate::InterpolateExecutorBase::linearOnnxCF(int outCoord, + float scale, + int inShape, + int outShape, + int& index0, + int& index1, + float& weight0, + float& weight1) { float inCoord = coordTransToInput(outCoord, scale, inShape, outShape); inCoord = std::max(0.0f, std::min(inCoord, static_cast(inShape - 1))); index0 = std::min(static_cast(inCoord), inShape - 1); @@ -2975,8 +3173,10 @@ void Interpolate::InterpolateExecutorBase::linearOnnxCF(int outCoord, float scal } } -void Interpolate::InterpolateExecutorBase::buildTblLinearOnnx(const VectorDims& srcDimPad5d, const VectorDims& dstDim5d, - const std::vector& dataScales, InterpolateLayoutType layout) { +void Interpolate::InterpolateExecutorBase::buildTblLinearOnnx(const VectorDims& srcDimPad5d, + const VectorDims& dstDim5d, + const std::vector& dataScales, + InterpolateLayoutType layout) { int dimSize = dataRank; float fz = (spatialDimSize > 2) ? dataScales[dimSize - 3] : 1.f; float fy = (spatialDimSize > 1) ? 
dataScales[dimSize - 2] : 1.f; @@ -3035,7 +3235,7 @@ void Interpolate::InterpolateExecutorBase::buildTblLinearOnnx(const VectorDims& indexPtr[1][idxOzOyOx] = (izF * IH * IW + iyT * IW + ixR) * scale; weightPtr[0][idxOzOyOx] = weightL; weightPtr[1][idxOzOyOx] = weightR; - if (spatialDimSize > 1) { + if (spatialDimSize > 1) { indexPtr[2][idxOzOyOx] = (izF * IH * IW + iyB * IW + ixL) * scale; indexPtr[3][idxOzOyOx] = (izF * IH * IW + iyB * IW + ixR) * scale; weightPtr[2][idxOzOyOx] = weightT; @@ -3088,8 +3288,11 @@ void Interpolate::InterpolateExecutorBase::buildTblLinearOnnx(const VectorDims& // wd .........wd, wh............wh, ww.............ww, id...........id, ih............ih, iw..............iw // | | // wh0.....wh_diameter ih0.....ih_diameter -void Interpolate::InterpolateExecutorBase::buildTblLinear(const VectorDims& srcDimPad5d, const VectorDims& dstDim5d, - const std::vector& dataScales, int kernel_width, bool antialias) { +void Interpolate::InterpolateExecutorBase::buildTblLinear(const VectorDims& srcDimPad5d, + const VectorDims& dstDim5d, + const std::vector& dataScales, + int kernel_width, + bool antialias) { int dimSize = dataRank; float fz = (dimSize == 5) ? 
dataScales[dimSize - 3] : 1.f; float fy = dataScales[dimSize - 2]; @@ -3113,15 +3316,15 @@ void Interpolate::InterpolateExecutorBase::buildTblLinear(const VectorDims& srcD int sizeOH = OH * diaOH; int sizeOW = OW * diaOW; auxTable.resize((sizeOD + sizeOH + sizeOW) * 2); - float *weightTable = reinterpret_cast(&auxTable[0]); - float *weightOD = static_cast(&weightTable[0]); - float *weightOH = static_cast(&weightTable[sizeOD]); - float *weightOW = static_cast(&weightTable[sizeOD + sizeOH]); + float* weightTable = reinterpret_cast(&auxTable[0]); + float* weightOD = static_cast(&weightTable[0]); + float* weightOH = static_cast(&weightTable[sizeOD]); + float* weightOW = static_cast(&weightTable[sizeOD + sizeOH]); - int *idxTable = static_cast(&auxTable[sizeOD + sizeOH + sizeOW]); - int *idxOD = static_cast(&idxTable[0]); - int *idxOH = static_cast(&idxTable[sizeOD]); - int *idxOW = static_cast(&idxTable[sizeOD + sizeOH]); + int* idxTable = static_cast(&auxTable[sizeOD + sizeOH + sizeOW]); + int* idxOD = static_cast(&idxTable[0]); + int* idxOH = static_cast(&idxTable[sizeOD]); + int* idxOW = static_cast(&idxTable[sizeOD + sizeOH]); for (size_t oz = 0; oz < OD; oz++) { float iz = coordTransToInput(oz, fz, ID, OD); @@ -3179,8 +3382,11 @@ std::vector Interpolate::InterpolateExecutorBase::getCubicCoeffs(float ma // table layout: // OW OW OW OW OW OH OH OH OH OH // x_idx x_weight0 x_weight1 x_weight2 x_weight3 y_idx y_weight0 y_weight1 y_weight2 y_weight3 -void Interpolate::InterpolateExecutorBase::buildTblCubic(const VectorDims& srcDimPad5d, const VectorDims& dstDim5d, const std::vector& dataScales, - float cubicCoeff, InterpolateLayoutType layout) { +void Interpolate::InterpolateExecutorBase::buildTblCubic(const VectorDims& srcDimPad5d, + const VectorDims& dstDim5d, + const std::vector& dataScales, + float cubicCoeff, + InterpolateLayoutType layout) { int dimSize = dataRank; float fy = dataScales[dimSize - 2]; float fx = dataScales[dimSize - 1]; @@ -3198,9 +3404,9 @@ void 
Interpolate::InterpolateExecutorBase::buildTblCubic(const VectorDims& srcDi } int tblAdvance = 0; - int *xOrigin = static_cast(&auxTable[tblAdvance]); + int* xOrigin = static_cast(&auxTable[tblAdvance]); tblAdvance += OW; - float *xFactor = reinterpret_cast(&auxTable[tblAdvance]); + float* xFactor = reinterpret_cast(&auxTable[tblAdvance]); for (int ox = 0; ox < OW; ox++) { float ix = coordTransToInput(ox, fx, IW, OW); int ix_r = static_cast(std::floor(ix)); @@ -3214,9 +3420,9 @@ void Interpolate::InterpolateExecutorBase::buildTblCubic(const VectorDims& srcDi } tblAdvance += CUBIC_GRID_LEN * OW; - int *yOrigin = static_cast(&auxTable[tblAdvance]); + int* yOrigin = static_cast(&auxTable[tblAdvance]); tblAdvance += OH; - float *yFactor = reinterpret_cast(&auxTable[tblAdvance]); + float* yFactor = reinterpret_cast(&auxTable[tblAdvance]); for (int oy = 0; oy < OH; oy++) { float iy = coordTransToInput(oy, fy, IH, OH); int iy_r = static_cast(std::floor(iy)); @@ -3231,9 +3437,9 @@ void Interpolate::InterpolateExecutorBase::buildTblCubic(const VectorDims& srcDi if (layout == InterpolateLayoutType::planar) { tblAdvance += CUBIC_GRID_LEN * OH; - int *sequenceOH = static_cast(&auxTable[tblAdvance]); + int* sequenceOH = static_cast(&auxTable[tblAdvance]); tblAdvance += OH * OW; - int *sequenceOW = static_cast(&auxTable[tblAdvance]); + int* sequenceOW = static_cast(&auxTable[tblAdvance]); for (int h = 0; h < OH; ++h) { int offset = h * OW; for (int w = 0; w < OW; ++w) { @@ -3263,8 +3469,11 @@ float Interpolate::InterpolateExecutorBase::getPillowBicubicCoeffs(float m) { return 0.0f; } -void Interpolate::InterpolateExecutorBase::buildTblPillow(const VectorDims& srcDimPad5d, const VectorDims& dstDim5d, const std::vector& dataScales, - float cubicCoeff, InterpolateLayoutType layout) { +void Interpolate::InterpolateExecutorBase::buildTblPillow(const VectorDims& srcDimPad5d, + const VectorDims& dstDim5d, + const std::vector& dataScales, + float cubicCoeff, + InterpolateLayoutType 
layout) { int dimSize = dataRank; float fy = dataScales[dimSize - 2]; float fx = dataScales[dimSize - 1]; @@ -3279,15 +3488,15 @@ void Interpolate::InterpolateExecutorBase::buildTblPillow(const VectorDims& srcD }; // pillowScale: e.g. 2.0 means down sample 2 times - auto generateArgs = [&] (float pillowScale) -> filterArgs { + auto generateArgs = [&](float pillowScale) -> filterArgs { filterArgs args; float scaleClip = pillowScale < 1.0f ? 1.0f : pillowScale; args.ScaleClipReciprocal = 1.0f / scaleClip; - args.filterRadius = (mode == InterpolateMode::bilinear_pillow) ? PILLOW_BILINEAR_WINDOW_SCALE * scaleClip : - PILLOW_BICUBIC_WINDOW_SCALE * scaleClip; + args.filterRadius = (mode == InterpolateMode::bilinear_pillow) ? PILLOW_BILINEAR_WINDOW_SCALE * scaleClip + : PILLOW_BICUBIC_WINDOW_SCALE * scaleClip; args.filterLen = static_cast(std::ceil(args.filterRadius) * 2 + 1); - args.weightGen = (mode == InterpolateMode::bilinear_pillow) ? this->getPillowBilinearCoeffs: - this->getPillowBicubicCoeffs; + args.weightGen = + (mode == InterpolateMode::bilinear_pillow) ? 
this->getPillowBilinearCoeffs : this->getPillowBicubicCoeffs; return args; }; @@ -3302,15 +3511,15 @@ void Interpolate::InterpolateExecutorBase::buildTblPillow(const VectorDims& srcD auxTable[offset] = filterArgsX.filterLen; auxTable[offset + 1] = filterArgsY.filterLen; offset += 2; - float *weightX = reinterpret_cast(&auxTable[offset]); + float* weightX = reinterpret_cast(&auxTable[offset]); offset += filterArgsX.filterLen * OW; - float *weightY = reinterpret_cast(&auxTable[offset]); + float* weightY = reinterpret_cast(&auxTable[offset]); offset += filterArgsY.filterLen * OH; - int *indexX = static_cast(&auxTable[offset]); + int* indexX = static_cast(&auxTable[offset]); offset += 2 * OW; - int *indexY = static_cast(&auxTable[offset]); + int* indexY = static_cast(&auxTable[offset]); - auto generateTbl = [&] (int inLen, int outLen, float fScale, filterArgs args, float* weightTbl, int* idxTbl) { + auto generateTbl = [&](int inLen, int outLen, float fScale, filterArgs args, float* weightTbl, int* idxTbl) { int min = 0; int max = 0; for (int ox = 0; ox < outLen; ox++) { @@ -3354,21 +3563,29 @@ void Interpolate::InterpolateExecutorBase::buildTblPillow(const VectorDims& srcD generateTbl(IH, OH, fy, filterArgsY, weightY, indexY); } -void Interpolate::InterpolateRefExecutor::NNRef(const uint8_t *in_ptr_, uint8_t *out_ptr_, int B, int C, int ID, int IH, int IW, - int OD, int OH, int OW) { - int *index_d = static_cast(&auxTable[0]); - int *index_h = static_cast(&auxTable[OD]); - int *index_w = static_cast(&auxTable[OD + OH]); - - const float *in_ptr_f32 = reinterpret_cast(in_ptr_); - float *out_ptr_f32 = reinterpret_cast(out_ptr_); +void Interpolate::InterpolateRefExecutor::NNRef(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + int B, + int C, + int ID, + int IH, + int IW, + int OD, + int OH, + int OW) { + int* index_d = static_cast(&auxTable[0]); + int* index_h = static_cast(&auxTable[OD]); + int* index_w = static_cast(&auxTable[OD + OH]); + + const float* in_ptr_f32 = 
reinterpret_cast(in_ptr_); + float* out_ptr_f32 = reinterpret_cast(out_ptr_); parallel_for3d(B, C, OD, [&](size_t b, size_t c, size_t od) { - const float *in_ptr = in_ptr_f32 + (IW * IH * ID * C * b + IW * IH * ID * c + IW * IH * index_d[od]); - float *out_ptr = out_ptr_f32 + (OW * OH * OD * C * b + OW * OH * OD * c + OW * OH * od); + const float* in_ptr = in_ptr_f32 + (IW * IH * ID * C * b + IW * IH * ID * c + IW * IH * index_d[od]); + float* out_ptr = out_ptr_f32 + (OW * OH * OD * C * b + OW * OH * OD * c + OW * OH * od); for (int oh = 0; oh < OH; oh++) { - const float *in_ptr_h = in_ptr + (IW * index_h[oh]); - float *out_ptr_h = out_ptr + (OW * oh); + const float* in_ptr_h = in_ptr + (IW * index_h[oh]); + float* out_ptr_h = out_ptr + (OW * oh); for (int ow = 0; ow < OW; ow++) { out_ptr_h[ow] = in_ptr_h[index_w[ow]]; } @@ -3376,8 +3593,16 @@ void Interpolate::InterpolateRefExecutor::NNRef(const uint8_t *in_ptr_, uint8_t }); } -void Interpolate::InterpolateRefExecutor::linearOnnxRef(const uint8_t *in_ptr_, uint8_t *out_ptr_, int B, int C, int ID, int IH, int IW, - int OD, int OH, int OW) { +void Interpolate::InterpolateRefExecutor::linearOnnxRef(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + int B, + int C, + int ID, + int IH, + int IW, + int OD, + int OH, + int OW) { std::vector indexPtr(MAX_INPUT_INTERPOLATE, 0); std::vector weightPtr(MAX_INPUT_INTERPOLATE, 0); // FrontTopLeft:0, FrontTopRight:1, FrontBottomLeft:2, FrontBottomRight:3, @@ -3406,87 +3631,87 @@ void Interpolate::InterpolateRefExecutor::linearOnnxRef(const uint8_t *in_ptr_, weightPtr[5] = reinterpret_cast(&auxTable[scratchLen + 5 * OW * OH * OD]); } - const float *in_ptr_f32 = reinterpret_cast(in_ptr_); - float *out_ptr_f32 = reinterpret_cast(out_ptr_); + const float* in_ptr_f32 = reinterpret_cast(in_ptr_); + float* out_ptr_f32 = reinterpret_cast(out_ptr_); parallel_for2d(B, C, [&](size_t b, size_t c) { - float *out_ptr_nc = out_ptr_f32 + (OD * OH * OW * C * b + OD * OH * OW * c); - const float 
*in_ptr_nc = in_ptr_f32 + (ID * IH * IW * C * b + ID * IH * IW * c); + float* out_ptr_nc = out_ptr_f32 + (OD * OH * OW * C * b + OD * OH * OW * c); + const float* in_ptr_nc = in_ptr_f32 + (ID * IH * IW * C * b + ID * IH * IW * c); // do not combined 1d/2d to 3d unified process to get rid of invalid computing. switch (spatialDimSize) { - case 1: - for (int i = 0; i < OW; i++) { - float src0 = in_ptr_nc[indexPtr[0][i]]; - float src1 = in_ptr_nc[indexPtr[1][i]]; + case 1: + for (int i = 0; i < OW; i++) { + float src0 = in_ptr_nc[indexPtr[0][i]]; + float src1 = in_ptr_nc[indexPtr[1][i]]; - out_ptr_nc[i] = src0 * weightPtr[0][i] + - src1 * weightPtr[1][i]; - } - break; - case 2: - for (int i = 0; i < OH * OW; i++) { - float src00 = in_ptr_nc[indexPtr[0][i]]; - float src01 = in_ptr_nc[indexPtr[1][i]]; - float src10 = in_ptr_nc[indexPtr[2][i]]; - float src11 = in_ptr_nc[indexPtr[3][i]]; - - out_ptr_nc[i] = src00 * weightPtr[2][i] * weightPtr[0][i] + - src01 * weightPtr[2][i] * weightPtr[1][i] + - src10 * weightPtr[3][i] * weightPtr[0][i] + - src11 * weightPtr[3][i] * weightPtr[1][i]; - } - break; - case 3: - for (int i = 0; i < OD * OH * OW; i++) { - float src000 = in_ptr_nc[indexPtr[0][i]]; - float src001 = in_ptr_nc[indexPtr[1][i]]; - float src010 = in_ptr_nc[indexPtr[2][i]]; - float src011 = in_ptr_nc[indexPtr[3][i]]; - float src100 = in_ptr_nc[indexPtr[4][i]]; - float src101 = in_ptr_nc[indexPtr[5][i]]; - float src110 = in_ptr_nc[indexPtr[6][i]]; - float src111 = in_ptr_nc[indexPtr[7][i]]; - - // float dstValue = - // weightPtr[4][i] * weightPtr[2][i] * weightPtr[0][i] * src000 + - // weightPtr[4][i] * weightPtr[2][i] * weightPtr[1][i] * src001 + - // weightPtr[4][i] * weightPtr[3][i] * weightPtr[0][i] * src010 + - // weightPtr[4][i] * weightPtr[3][i] * weightPtr[1][i] * src011 + - // weightPtr[5][i] * weightPtr[2][i] * weightPtr[0][i] * src100 + - // weightPtr[5][i] * weightPtr[2][i] * weightPtr[1][i] * src101 + - // weightPtr[5][i] * weightPtr[3][i] * 
weightPtr[0][i] * src110 + - // weightPtr[5][i] * weightPtr[3][i] * weightPtr[1][i] * src111; - - out_ptr_nc[i] = - weightPtr[4][i] * (weightPtr[2][i] * (weightPtr[0][i] * src000 + - weightPtr[1][i] * src001) + - weightPtr[3][i] * (weightPtr[0][i] * src010 + - weightPtr[1][i] * src011)) + - weightPtr[5][i] * (weightPtr[2][i] * (weightPtr[0][i] * src100 + - weightPtr[1][i] * src101) + - weightPtr[3][i] * (weightPtr[0][i] * src110 + - weightPtr[1][i] * src111)); - } - break; - default: - break; + out_ptr_nc[i] = src0 * weightPtr[0][i] + src1 * weightPtr[1][i]; + } + break; + case 2: + for (int i = 0; i < OH * OW; i++) { + float src00 = in_ptr_nc[indexPtr[0][i]]; + float src01 = in_ptr_nc[indexPtr[1][i]]; + float src10 = in_ptr_nc[indexPtr[2][i]]; + float src11 = in_ptr_nc[indexPtr[3][i]]; + + out_ptr_nc[i] = src00 * weightPtr[2][i] * weightPtr[0][i] + src01 * weightPtr[2][i] * weightPtr[1][i] + + src10 * weightPtr[3][i] * weightPtr[0][i] + src11 * weightPtr[3][i] * weightPtr[1][i]; + } + break; + case 3: + for (int i = 0; i < OD * OH * OW; i++) { + float src000 = in_ptr_nc[indexPtr[0][i]]; + float src001 = in_ptr_nc[indexPtr[1][i]]; + float src010 = in_ptr_nc[indexPtr[2][i]]; + float src011 = in_ptr_nc[indexPtr[3][i]]; + float src100 = in_ptr_nc[indexPtr[4][i]]; + float src101 = in_ptr_nc[indexPtr[5][i]]; + float src110 = in_ptr_nc[indexPtr[6][i]]; + float src111 = in_ptr_nc[indexPtr[7][i]]; + + // float dstValue = + // weightPtr[4][i] * weightPtr[2][i] * weightPtr[0][i] * src000 + + // weightPtr[4][i] * weightPtr[2][i] * weightPtr[1][i] * src001 + + // weightPtr[4][i] * weightPtr[3][i] * weightPtr[0][i] * src010 + + // weightPtr[4][i] * weightPtr[3][i] * weightPtr[1][i] * src011 + + // weightPtr[5][i] * weightPtr[2][i] * weightPtr[0][i] * src100 + + // weightPtr[5][i] * weightPtr[2][i] * weightPtr[1][i] * src101 + + // weightPtr[5][i] * weightPtr[3][i] * weightPtr[0][i] * src110 + + // weightPtr[5][i] * weightPtr[3][i] * weightPtr[1][i] * src111; + + out_ptr_nc[i] = 
+ weightPtr[4][i] * (weightPtr[2][i] * (weightPtr[0][i] * src000 + weightPtr[1][i] * src001) + + weightPtr[3][i] * (weightPtr[0][i] * src010 + weightPtr[1][i] * src011)) + + weightPtr[5][i] * (weightPtr[2][i] * (weightPtr[0][i] * src100 + weightPtr[1][i] * src101) + + weightPtr[3][i] * (weightPtr[0][i] * src110 + weightPtr[1][i] * src111)); + } + break; + default: + break; } }); } -void Interpolate::InterpolateRefExecutor::cubicRef(const uint8_t *in_ptr_, uint8_t *out_ptr_, int B, int C, int IH, int IW, int OH, int OW) { +void Interpolate::InterpolateRefExecutor::cubicRef(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + int B, + int C, + int IH, + int IW, + int OH, + int OW) { const int idxNum = 1; - int *xOrigin = static_cast(&auxTable[0]); - float *xFactor = reinterpret_cast(&auxTable[OW]); - int *yOrigin = static_cast(&auxTable[(CUBIC_GRID_LEN + idxNum) * OW]); - float *yFactor = reinterpret_cast(&auxTable[(CUBIC_GRID_LEN + idxNum) * OW + OH]); + int* xOrigin = static_cast(&auxTable[0]); + float* xFactor = reinterpret_cast(&auxTable[OW]); + int* yOrigin = static_cast(&auxTable[(CUBIC_GRID_LEN + idxNum) * OW]); + float* yFactor = reinterpret_cast(&auxTable[(CUBIC_GRID_LEN + idxNum) * OW + OH]); - const float *in_ptr_f32 = reinterpret_cast(in_ptr_); - float *out_ptr_f32 = reinterpret_cast(out_ptr_); + const float* in_ptr_f32 = reinterpret_cast(in_ptr_); + float* out_ptr_f32 = reinterpret_cast(out_ptr_); parallel_for4d(B, C, OH, OW, [&](size_t n, size_t c, size_t oy, size_t ox) { - const float *in_ptr_nc = in_ptr_f32 + (IW * IH * C * n + IW * IH * c); - float *out_ptr_nc = out_ptr_f32 + (OW * OH * C * n + OW * OH * c); + const float* in_ptr_nc = in_ptr_f32 + (IW * IH * C * n + IW * IH * c); + float* out_ptr_nc = out_ptr_f32 + (OW * OH * C * n + OW * OH * c); int iy = yOrigin[oy]; int ix = xOrigin[ox]; @@ -3494,7 +3719,7 @@ void Interpolate::InterpolateRefExecutor::cubicRef(const uint8_t *in_ptr_, uint8 float retY = 0.f; for (int y = iy - 1, i = 0; y <= iy + 2; y++, 
i++) { int yInRange = std::max(0, std::min(y, IH - 1)); - const float *in_ptr_nch = in_ptr_nc + IW * yInRange; + const float* in_ptr_nch = in_ptr_nc + IW * yInRange; float retX = 0.f; for (int x = ix - 1, j = 0; x <= ix + 2; x++, j++) { int xInRange = std::max(0, std::min(x, IW - 1)); @@ -3506,66 +3731,79 @@ void Interpolate::InterpolateRefExecutor::cubicRef(const uint8_t *in_ptr_, uint8 }); } -float Interpolate::InterpolateRefExecutor::getValue(const uint8_t *base, size_t offset, ov::element::Type prec) { - const uint8_t *baseOffset = base + offset; +float Interpolate::InterpolateRefExecutor::getValue(const uint8_t* base, size_t offset, ov::element::Type prec) { + const uint8_t* baseOffset = base + offset; switch (prec) { - case ov::element::u8: { - return static_cast(*baseOffset); - break; - } - case ov::element::i8: { - const int8_t *valuePtr = reinterpret_cast(baseOffset); - return static_cast(*valuePtr); - break; - } - case ov::element::bf16: { - const uint16_t *valuePtr = reinterpret_cast(baseOffset); - return bfloat16_t::from_bits(*valuePtr); - break; - } - case ov::element::f32: { - const float *valuePtr = reinterpret_cast(baseOffset); - return *valuePtr; - break; - } - default: { - OPENVINO_THROW("Interpolate layer does not support precision: ", prec); - break; - } + case ov::element::u8: { + return static_cast(*baseOffset); + break; + } + case ov::element::i8: { + const int8_t* valuePtr = reinterpret_cast(baseOffset); + return static_cast(*valuePtr); + break; + } + case ov::element::bf16: { + const uint16_t* valuePtr = reinterpret_cast(baseOffset); + return bfloat16_t::from_bits(*valuePtr); + break; + } + case ov::element::f32: { + const float* valuePtr = reinterpret_cast(baseOffset); + return *valuePtr; + break; + } + default: { + OPENVINO_THROW("Interpolate layer does not support precision: ", prec); + break; + } } } -void Interpolate::InterpolateRefExecutor::setValue(uint8_t *base, size_t offset, float value, ov::element::Type prec) { - uint8_t 
*baseOffset = base + offset; +void Interpolate::InterpolateRefExecutor::setValue(uint8_t* base, size_t offset, float value, ov::element::Type prec) { + uint8_t* baseOffset = base + offset; switch (prec) { - case ov::element::u8: { - uint8_t data = static_cast(value < 0 ? 0 : value); - cpu_memcpy(baseOffset, &data, 1); - break; - } - case ov::element::i8: { - int8_t data = static_cast(value); - cpu_memcpy(baseOffset, &data, 1); - break; - } - case ov::element::bf16: { - uint16_t data = bfloat16_t(value).to_bits(); - cpu_memcpy(baseOffset, &data, 2); - break; - } - case ov::element::f32: { - cpu_memcpy(baseOffset, &value, sizeof(float)); - break; - } - default: { - OPENVINO_THROW("Interpolate layer does not support precision: ", prec); - break; - } + case ov::element::u8: { + uint8_t data = static_cast(value < 0 ? 0 : value); + cpu_memcpy(baseOffset, &data, 1); + break; + } + case ov::element::i8: { + int8_t data = static_cast(value); + cpu_memcpy(baseOffset, &data, 1); + break; + } + case ov::element::bf16: { + uint16_t data = bfloat16_t(value).to_bits(); + cpu_memcpy(baseOffset, &data, 2); + break; + } + case ov::element::f32: { + cpu_memcpy(baseOffset, &value, sizeof(float)); + break; + } + default: { + OPENVINO_THROW("Interpolate layer does not support precision: ", prec); + break; + } } } -void Interpolate::InterpolateRefExecutor::linearInterpolation(const uint8_t *in_ptr_, uint8_t *out_ptr_, int B, int C, int ID, int IH, int IW, - float fx, float fy, float fz, int OD, int OH, int OW, int kernel_width, bool antialias) { +void Interpolate::InterpolateRefExecutor::linearInterpolation(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + int B, + int C, + int ID, + int IH, + int IW, + float fx, + float fy, + float fz, + int OD, + int OH, + int OW, + int kernel_width, + bool antialias) { if (IW == OW && IH == OH && ID == OD) { size_t spatialDimSize = IW * IH * ID; // TODO: enable when fusing into interp with linear mode will support @@ -3574,8 +3812,8 @@ void 
Interpolate::InterpolateRefExecutor::linearInterpolation(const uint8_t *in_ cpu_memcpy(out_ptr_, in_ptr_, size); } else { parallel_for2d(B, C, [&](size_t b, size_t c) { - const uint8_t *in_ptr_nc = in_ptr_ + (spatialDimSize * C * b + spatialDimSize * c) * srcDataSize; - uint8_t *out_ptr_nc = out_ptr_ + (spatialDimSize * C * b + spatialDimSize * c) * dstDataSize; + const uint8_t* in_ptr_nc = in_ptr_ + (spatialDimSize * C * b + spatialDimSize * c) * srcDataSize; + uint8_t* out_ptr_nc = out_ptr_ + (spatialDimSize * C * b + spatialDimSize * c) * dstDataSize; for (size_t i = 0; i < spatialDimSize; i++) { float dstValue = getValue(in_ptr_nc, i * srcDataSize, inputPrec); setValue(out_ptr_nc, i * dstDataSize, dstValue, outputPrec); @@ -3600,23 +3838,23 @@ void Interpolate::InterpolateRefExecutor::linearInterpolation(const uint8_t *in_ int sizeOH = OH * diaOH; int sizeOW = OW * diaOW; - float *weightTable = reinterpret_cast(&auxTable[0]); - float *weightOD = static_cast(&weightTable[0]); - float *weightOH = static_cast(&weightTable[sizeOD]); - float *weightOW = static_cast(&weightTable[sizeOD + sizeOH]); + float* weightTable = reinterpret_cast(&auxTable[0]); + float* weightOD = static_cast(&weightTable[0]); + float* weightOH = static_cast(&weightTable[sizeOD]); + float* weightOW = static_cast(&weightTable[sizeOD + sizeOH]); - int *idxTable = static_cast(&auxTable[sizeOD + sizeOH + sizeOW]); - int *idxOD = static_cast(&idxTable[0]); - int *idxOH = static_cast(&idxTable[sizeOD]); - int *idxOW = static_cast(&idxTable[sizeOD + sizeOH]); + int* idxTable = static_cast(&auxTable[sizeOD + sizeOH + sizeOW]); + int* idxOD = static_cast(&idxTable[0]); + int* idxOH = static_cast(&idxTable[sizeOD]); + int* idxOW = static_cast(&idxTable[sizeOD + sizeOH]); parallel_for2d(B, C, [&](size_t b, size_t c) { - const uint8_t *in_ptr_nc = in_ptr_ + (IW * IH * ID * C * b + IW * IH * ID * c) * srcDataSize; - uint8_t *out_ptr_nc = out_ptr_ + (OW * OH * OD * C * b + OW * OH * OD * c) * dstDataSize; + 
const uint8_t* in_ptr_nc = in_ptr_ + (IW * IH * ID * C * b + IW * IH * ID * c) * srcDataSize; + uint8_t* out_ptr_nc = out_ptr_ + (OW * OH * OD * C * b + OW * OH * OD * c) * dstDataSize; for (int oz = 0; oz < OD; oz++) { - uint8_t *out_ptr_ncd = out_ptr_nc + (OW * OH * oz) * dstDataSize; + uint8_t* out_ptr_ncd = out_ptr_nc + (OW * OH * oz) * dstDataSize; for (int oy = 0; oy < OH; oy++) { - uint8_t *out_ptr_ncdh = out_ptr_ncd + (OW * oy) * dstDataSize; + uint8_t* out_ptr_ncdh = out_ptr_ncd + (OW * oy) * dstDataSize; for (int ox = 0; ox < OW; ox++) { float sum = 0.f; float wsum = 0.f; @@ -3659,9 +3897,13 @@ void Interpolate::InterpolateRefExecutor::linearInterpolation(const uint8_t *in_ if (weightOW[ox * diaOW + ix] == 0.f) { continue; } - float w = weightOD[oz * diaOD + iz] * weightOH[oy * diaOH + iy] * weightOW[ox * diaOW + ix]; + float w = + weightOD[oz * diaOD + iz] * weightOH[oy * diaOH + iy] * weightOW[ox * diaOW + ix]; float value = getValue(in_ptr_nc, - (idxOD[oz * diaOD + iz] * IH * IW + idxOH[oy * diaOH + iy] * IW + idxOW[ox * diaOW + ix]) * srcDataSize, inputPrec); + (idxOD[oz * diaOD + iz] * IH * IW + idxOH[oy * diaOH + iy] * IW + + idxOW[ox * diaOW + ix]) * + srcDataSize, + inputPrec); sum += w * value; wsum += w; @@ -3681,18 +3923,25 @@ void Interpolate::InterpolateRefExecutor::linearInterpolation(const uint8_t *in_ }); } -void Interpolate::InterpolateRefExecutor::pillowRef(const uint8_t *in_ptr_, uint8_t *out_ptr_, int B, int C, int IH, int IW, int OH, int OW) { +void Interpolate::InterpolateRefExecutor::pillowRef(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + int B, + int C, + int IH, + int IW, + int OH, + int OW) { size_t offset = 0; int filterLenX = auxTable[offset]; int filterLenY = auxTable[offset + 1]; offset += 2; - float *weightX = reinterpret_cast(&auxTable[offset]); + float* weightX = reinterpret_cast(&auxTable[offset]); offset += filterLenX * OW; - float *weightY = reinterpret_cast(&auxTable[offset]); + float* weightY = 
reinterpret_cast(&auxTable[offset]); offset += filterLenY * OH; - int *indexX = static_cast(&auxTable[offset]); + int* indexX = static_cast(&auxTable[offset]); offset += 2 * OW; - int *indexY = static_cast(&auxTable[offset]); + int* indexY = static_cast(&auxTable[offset]); // workBuffer needed when both pass is true bool xPass = IW != OW; @@ -3710,21 +3959,24 @@ void Interpolate::InterpolateRefExecutor::pillowRef(const uint8_t *in_ptr_, uint // | | // ---- auto bc_loop = [&](size_t b, size_t c) { - const uint8_t *in_ptr_nc = in_ptr_ + (IW * IH * C * b + IW * IH * c) * srcDataSize; - uint8_t *out_ptr_nc = out_ptr_ + (OW * OH * C * b + OW * OH * c) * dstDataSize; - uint8_t *xpass_out_ptr_nc = nullptr; - const uint8_t *ypass_in_ptr_nc = nullptr; + const uint8_t* in_ptr_nc = in_ptr_ + (IW * IH * C * b + IW * IH * c) * srcDataSize; + uint8_t* out_ptr_nc = out_ptr_ + (OW * OH * C * b + OW * OH * c) * dstDataSize; + uint8_t* xpass_out_ptr_nc = nullptr; + const uint8_t* ypass_in_ptr_nc = nullptr; if (xPass && yPass) { size_t parallel_num = B * C; // IH * OW buf needed if (parallel_num < m_threads_num) { - xpass_out_ptr_nc = static_cast(&pillow_working_buf[(OW * IH * C * b + OW * IH * c) * srcDataSize]); - ypass_in_ptr_nc = static_cast(&pillow_working_buf[(OW * IH * C * b + OW * IH * c) * srcDataSize]); + xpass_out_ptr_nc = + static_cast(&pillow_working_buf[(OW * IH * C * b + OW * IH * c) * srcDataSize]); + ypass_in_ptr_nc = + static_cast(&pillow_working_buf[(OW * IH * C * b + OW * IH * c) * srcDataSize]); } else { size_t threadsIdx = parallel_get_thread_num(); size_t buffer_size = static_cast(OW * IH); xpass_out_ptr_nc = static_cast(&pillow_working_buf[threadsIdx * buffer_size * srcDataSize]); - ypass_in_ptr_nc = static_cast(&pillow_working_buf[threadsIdx * buffer_size * srcDataSize]); + ypass_in_ptr_nc = + static_cast(&pillow_working_buf[threadsIdx * buffer_size * srcDataSize]); } } else if (xPass && !yPass) { xpass_out_ptr_nc = out_ptr_nc; @@ -3782,14 +4034,14 @@ void 
Interpolate::InterpolateRefExecutor::pillowRef(const uint8_t *in_ptr_, uint void Interpolate::InterpolateExecutorBase::create_pillow_working_buf(InterpolateLayoutType layout) { if (srcDimPad5d[3] == dstDim5d[3] || srcDimPad5d[4] == dstDim5d[4]) return; - size_t bufSize = srcDimPad5d[3] * dstDim5d[4] * srcDataSize; // IH * OW + size_t bufSize = srcDimPad5d[3] * dstDim5d[4] * srcDataSize; // IH * OW m_threads_num = parallel_get_max_threads(); if (layout == InterpolateLayoutType::planar) { // B and C execute in parallel, need separate buf size_t parallel_num = srcDimPad5d[0] * srcDimPad5d[1]; bufSize *= std::min(m_threads_num, parallel_num); } else { - bufSize *= srcDimPad5d[1]; // *C + bufSize *= srcDimPad5d[1]; // *C // B execute in parallel, need separate buf size_t parallel_num = srcDimPad5d[0]; bufSize *= std::min(m_threads_num, parallel_num); @@ -3798,11 +4050,14 @@ void Interpolate::InterpolateExecutorBase::create_pillow_working_buf(Interpolate } Interpolate::InterpolateExecutorBase::InterpolateExecutorBase(const InterpolateAttrs& interpAttrs, - const VectorDims &srcDims, - const VectorDims &dstDims, - const std::vector &dataScales) : - mode(interpAttrs.mode), coordTransMode(interpAttrs.coordTransMode), configured_for_layout(interpAttrs.layout), - inputPrec(interpAttrs.inPrc), outputPrec(interpAttrs.outPrc) { + const VectorDims& srcDims, + const VectorDims& dstDims, + const std::vector& dataScales) + : mode(interpAttrs.mode), + coordTransMode(interpAttrs.coordTransMode), + configured_for_layout(interpAttrs.layout), + inputPrec(interpAttrs.inPrc), + outputPrec(interpAttrs.outPrc) { srcDimPad5d = to5Dim(getPaddedInputShape(srcDims, interpAttrs.padBegin, interpAttrs.padEnd)); dstDim5d = to5Dim(dstDims); srcDataSize = interpAttrs.inPrc.size(); @@ -3811,44 +4066,44 @@ Interpolate::InterpolateExecutorBase::InterpolateExecutorBase(const InterpolateA spatialDimSize = getSpatialDimsNum(dataRank); switch (mode) { - case InterpolateMode::nearest: { - 
buildTblNN(srcDimPad5d, dstDim5d, dataScales, interpAttrs.layout, interpAttrs.nearestMode); - break; - } - case InterpolateMode::linear_onnx: { - buildTblLinearOnnx(srcDimPad5d, dstDim5d, dataScales, interpAttrs.layout); - break; - } - case InterpolateMode::linear: { - static constexpr int LINEAR_KERNEL = 2; - buildTblLinear(srcDimPad5d, dstDim5d, dataScales, LINEAR_KERNEL, interpAttrs.antialias); - break; - } - case InterpolateMode::cubic: { - buildTblCubic(srcDimPad5d, dstDim5d, dataScales, interpAttrs.cubeCoeff, interpAttrs.layout); - break; - } - case InterpolateMode::bilinear_pillow: - case InterpolateMode::bicubic_pillow: { - buildTblPillow(srcDimPad5d, dstDim5d, dataScales, interpAttrs.cubeCoeff, interpAttrs.layout); - if ((srcDimPad5d[4] != dstDim5d[4]) && (srcDimPad5d[3] != dstDim5d[3])) { - create_pillow_working_buf(interpAttrs.layout); - } - break; - } - default: { - OPENVINO_THROW("Interpolate executor does not support interpolate mode: ", mode); - break; + case InterpolateMode::nearest: { + buildTblNN(srcDimPad5d, dstDim5d, dataScales, interpAttrs.layout, interpAttrs.nearestMode); + break; + } + case InterpolateMode::linear_onnx: { + buildTblLinearOnnx(srcDimPad5d, dstDim5d, dataScales, interpAttrs.layout); + break; + } + case InterpolateMode::linear: { + static constexpr int LINEAR_KERNEL = 2; + buildTblLinear(srcDimPad5d, dstDim5d, dataScales, LINEAR_KERNEL, interpAttrs.antialias); + break; + } + case InterpolateMode::cubic: { + buildTblCubic(srcDimPad5d, dstDim5d, dataScales, interpAttrs.cubeCoeff, interpAttrs.layout); + break; + } + case InterpolateMode::bilinear_pillow: + case InterpolateMode::bicubic_pillow: { + buildTblPillow(srcDimPad5d, dstDim5d, dataScales, interpAttrs.cubeCoeff, interpAttrs.layout); + if ((srcDimPad5d[4] != dstDim5d[4]) && (srcDimPad5d[3] != dstDim5d[3])) { + create_pillow_working_buf(interpAttrs.layout); } + break; + } + default: { + OPENVINO_THROW("Interpolate executor does not support interpolate mode: ", mode); + break; 
+ } } } Interpolate::InterpolateJitExecutor::InterpolateJitExecutor(const InterpolateAttrs& interpAttrs, - const VectorDims &srcDims, - const VectorDims &dstDims, - const std::vector &dataScales, - const dnnl::primitive_attr &attr) : - InterpolateExecutorBase(interpAttrs, srcDims, dstDims, dataScales) { + const VectorDims& srcDims, + const VectorDims& dstDims, + const std::vector& dataScales, + const dnnl::primitive_attr& attr) + : InterpolateExecutorBase(interpAttrs, srcDims, dstDims, dataScales) { auto jcp = jit_interpolate_config_params(); jcp.mode = mode; jcp.src_prc = interpAttrs.inPrc; @@ -3885,7 +4140,7 @@ Interpolate::InterpolateJitExecutor::InterpolateJitExecutor(const InterpolateAtt } else { OPENVINO_THROW("Can't create InterpolateJitExecutor"); } -#endif // OPENVINO_ARCH_X86_64 +#endif // OPENVINO_ARCH_X86_64 if (interpolateKernel) { interpolateKernel->create_ker(); } else { @@ -3893,7 +4148,7 @@ Interpolate::InterpolateJitExecutor::InterpolateJitExecutor(const InterpolateAtt } } -void Interpolate::InterpolateJitExecutor::exec(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_) { +void Interpolate::InterpolateJitExecutor::exec(const uint8_t* in_ptr_, uint8_t* out_ptr_, const void* post_ops_data_) { size_t N = srcDimPad5d[0], C = srcDimPad5d[1], ID = srcDimPad5d[2], IH = srcDimPad5d[3], IW = srcDimPad5d[4]; size_t OD = dstDim5d[2], OH = dstDim5d[3], OW = dstDim5d[4]; @@ -3901,103 +4156,115 @@ void Interpolate::InterpolateJitExecutor::exec(const uint8_t *in_ptr_, uint8_t * OPENVINO_THROW("Can't execute, kernel for Interpolate node is not compiled"); } switch (mode) { - case InterpolateMode::nearest: { - if (configured_for_layout == InterpolateLayoutType::planar) { - NNPlanar(in_ptr_, out_ptr_, post_ops_data_, N, C, ID, IH, IW, OD, OH, OW); - } else { - NNCGathered(in_ptr_, out_ptr_, post_ops_data_, N, C, ID, IH, IW, OD, OH, OW); - } - break; - } - case InterpolateMode::linear_onnx: { - if (configured_for_layout == 
InterpolateLayoutType::planar) { - linearOnnxPlanar(in_ptr_, out_ptr_, post_ops_data_, N, C, ID, IH, IW, OD, OH, OW); - } else { - linearOnnxCGathered(in_ptr_, out_ptr_, post_ops_data_, N, C, ID, IH, IW, OD, OH, OW); - } - break; + case InterpolateMode::nearest: { + if (configured_for_layout == InterpolateLayoutType::planar) { + NNPlanar(in_ptr_, out_ptr_, post_ops_data_, N, C, ID, IH, IW, OD, OH, OW); + } else { + NNCGathered(in_ptr_, out_ptr_, post_ops_data_, N, C, ID, IH, IW, OD, OH, OW); } - case InterpolateMode::cubic: { - if (configured_for_layout == InterpolateLayoutType::planar) { - cubicPlanar(in_ptr_, out_ptr_, post_ops_data_, N, C, IH, IW, OH, OW); - } else { - cubicCGathered(in_ptr_, out_ptr_, post_ops_data_, N, C, IH, IW, OH, OW); - } - break; + break; + } + case InterpolateMode::linear_onnx: { + if (configured_for_layout == InterpolateLayoutType::planar) { + linearOnnxPlanar(in_ptr_, out_ptr_, post_ops_data_, N, C, ID, IH, IW, OD, OH, OW); + } else { + linearOnnxCGathered(in_ptr_, out_ptr_, post_ops_data_, N, C, ID, IH, IW, OD, OH, OW); } - case InterpolateMode::bilinear_pillow: - case InterpolateMode::bicubic_pillow: { - if (configured_for_layout == InterpolateLayoutType::by_channel) { - pillowCGathered(in_ptr_, out_ptr_, post_ops_data_, N, C, IH, IW, OH, OW); - } else { - OPENVINO_THROW("Only channel_first jit kernel is supported for pillow mode", mode); - } - break; + break; + } + case InterpolateMode::cubic: { + if (configured_for_layout == InterpolateLayoutType::planar) { + cubicPlanar(in_ptr_, out_ptr_, post_ops_data_, N, C, IH, IW, OH, OW); + } else { + cubicCGathered(in_ptr_, out_ptr_, post_ops_data_, N, C, IH, IW, OH, OW); } - default: { - OPENVINO_THROW("InterpolateJitExecutor has unsupported interpolate mode: ", mode); + break; + } + case InterpolateMode::bilinear_pillow: + case InterpolateMode::bicubic_pillow: { + if (configured_for_layout == InterpolateLayoutType::by_channel) { + pillowCGathered(in_ptr_, out_ptr_, post_ops_data_, N, C, 
IH, IW, OH, OW); + } else { + OPENVINO_THROW("Only channel_first jit kernel is supported for pillow mode", mode); } + break; + } + default: { + OPENVINO_THROW("InterpolateJitExecutor has unsupported interpolate mode: ", mode); + } } } -void Interpolate::InterpolateRefExecutor::exec(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_) { +void Interpolate::InterpolateRefExecutor::exec(const uint8_t* in_ptr_, uint8_t* out_ptr_, const void* post_ops_data_) { size_t N = srcDimPad5d[0], C = srcDimPad5d[1], ID = srcDimPad5d[2], IH = srcDimPad5d[3], IW = srcDimPad5d[4]; size_t OD = dstDim5d[2], OH = dstDim5d[3], OW = dstDim5d[4]; switch (mode) { - case InterpolateMode::nearest: { - NNRef(in_ptr_, out_ptr_, N, C, ID, IH, IW, OD, OH, OW); - break; - } - case InterpolateMode::linear_onnx: { - linearOnnxRef(in_ptr_, out_ptr_, N, C, ID, IH, IW, OD, OH, OW); - break; - } - case InterpolateMode::cubic: { - cubicRef(in_ptr_, out_ptr_, N, C, IH, IW, OH, OW); - break; - } - case InterpolateMode::linear: { - float fz = (dataRank == 5) ? 
dataScales[dataRank - 3] : 1.f; - float fy = dataScales[dataRank - 2]; - float fx = dataScales[dataRank - 1]; - - bool isDownsample = (fx < 1.f) || (fy < 1.f) || (fz < 1.f); - int kernel_width = 2; - linearInterpolation(in_ptr_, out_ptr_, N, C, ID, IH, IW, fx, fy, fz, OD, OH, OW, kernel_width, isDownsample && antialias); - break; - } - case InterpolateMode::bilinear_pillow: - case InterpolateMode::bicubic_pillow: { - pillowRef(in_ptr_, out_ptr_, N, C, IH, IW, OH, OW); - break; - } - default: { - OPENVINO_THROW("Interpolate layer has unsupported interpolate mode: ", mode); - } + case InterpolateMode::nearest: { + NNRef(in_ptr_, out_ptr_, N, C, ID, IH, IW, OD, OH, OW); + break; + } + case InterpolateMode::linear_onnx: { + linearOnnxRef(in_ptr_, out_ptr_, N, C, ID, IH, IW, OD, OH, OW); + break; + } + case InterpolateMode::cubic: { + cubicRef(in_ptr_, out_ptr_, N, C, IH, IW, OH, OW); + break; + } + case InterpolateMode::linear: { + float fz = (dataRank == 5) ? dataScales[dataRank - 3] : 1.f; + float fy = dataScales[dataRank - 2]; + float fx = dataScales[dataRank - 1]; + + bool isDownsample = (fx < 1.f) || (fy < 1.f) || (fz < 1.f); + int kernel_width = 2; + linearInterpolation(in_ptr_, + out_ptr_, + N, + C, + ID, + IH, + IW, + fx, + fy, + fz, + OD, + OH, + OW, + kernel_width, + isDownsample && antialias); + break; + } + case InterpolateMode::bilinear_pillow: + case InterpolateMode::bicubic_pillow: { + pillowRef(in_ptr_, out_ptr_, N, C, IH, IW, OH, OW); + break; + } + default: { + OPENVINO_THROW("Interpolate layer has unsupported interpolate mode: ", mode); + } } } size_t Interpolate::getSpatialDimsNum(const Dim rank) { switch (rank) { - case 1: - case 3: - return 1; - case 2: - case 4: - return 2; - case 5: - return 3; - default: - OPENVINO_THROW("Can't define number spatial"); + case 1: + case 3: + return 1; + case 2: + case 4: + return 2; + case 5: + return 3; + default: + OPENVINO_THROW("Can't define number spatial"); } } bool Interpolate::canFuse(const NodePtr& 
node) const { - if (!mayiuse(cpu::x64::sse41) || - interpAttrs.mode == InterpolateMode::linear || - interpAttrs.mode == InterpolateMode::bilinear_pillow || - interpAttrs.mode == InterpolateMode::bicubic_pillow || + if (!mayiuse(cpu::x64::sse41) || interpAttrs.mode == InterpolateMode::linear || + interpAttrs.mode == InterpolateMode::bilinear_pillow || interpAttrs.mode == InterpolateMode::bicubic_pillow || (!one_of(dataRank, 4u, 5u) && !mayiuse(cpu::x64::avx2))) { return false; } @@ -4009,6 +4276,6 @@ bool Interpolate::created() const { return getType() == Type::Interpolate; } -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/interpolate.h b/src/plugins/intel_cpu/src/nodes/interpolate.h index a43b354aa0306a..c6fedf384f449d 100644 --- a/src/plugins/intel_cpu/src/nodes/interpolate.h +++ b/src/plugins/intel_cpu/src/nodes/interpolate.h @@ -31,34 +31,36 @@ struct jit_interpolate_config_params { }; struct jit_interpolate_call_args { - const void *src_ptr[MAX_INPUT_INTERPOLATE]; - const void *weight_ptr[MAX_INPUT_INTERPOLATE]; - const int *index; - void *dst; + const void* src_ptr[MAX_INPUT_INTERPOLATE]; + const void* weight_ptr[MAX_INPUT_INTERPOLATE]; + const int* index; + void* dst; size_t work_amount; size_t oc_off; - //ptr to array of post op inputs pointers (flat list) + // ptr to array of post op inputs pointers (flat list) const void* post_op_data; }; struct jit_uni_interpolate_kernel { - void (*ker_)(const jit_interpolate_call_args *); + void (*ker_)(const jit_interpolate_call_args*); - void operator()(const jit_interpolate_call_args *args) { + void operator()(const jit_interpolate_call_args* args) { assert(ker_); ker_(args); } - explicit jit_uni_interpolate_kernel(jit_interpolate_config_params jcp, const dnnl_primitive_attr &attr) : ker_(nullptr), jcp_(jcp), attr_(attr) {} + explicit jit_uni_interpolate_kernel(jit_interpolate_config_params 
jcp, const dnnl_primitive_attr& attr) + : ker_(nullptr), + jcp_(jcp), + attr_(attr) {} virtual ~jit_uni_interpolate_kernel() {} virtual void create_ker() = 0; jit_interpolate_config_params jcp_; - const dnnl_primitive_attr &attr_; + const dnnl_primitive_attr& attr_; }; - class Interpolate : public Node { public: static constexpr size_t DATA_ID = 0; @@ -98,8 +100,9 @@ class Interpolate : public Node { bool is_version11 = true; InterpolateAttrs interpAttrs; // Some FEs or preprocessing step resize spatial dimension for tensor with NHWC layout memory, - // but imported as planar layout[abcd] with axis[1,2] for convenience. In this case, for pillow modes without pad for now, - // nhwc layout path and the kernel(nhwc layout executor) can be used for this planar layout and axis settings(NCHWAsNHWC is true) to get higher perf with + // but imported as planar layout[abcd] with axis[1,2] for convenience. In this case, for pillow modes without pad + // for now, nhwc layout path and the kernel(nhwc layout executor) can be used for this planar layout and axis + // settings(NCHWAsNHWC is true) to get higher perf with // 1. logical shape alignment [abcd-nhwc] to [adbc-nchw]. // 2. axis alignment [1,2] to [2,3]. // 3. config planar layout support and treated it as channel_first layout. 
@@ -107,120 +110,226 @@ class Interpolate : public Node { size_t dataRank = 0; class InterpolateExecutorBase { - public: - InterpolateExecutorBase(const InterpolateAttrs& interpAttrs, - const VectorDims &srcDims, - const VectorDims &dstDims, - const std::vector &dataScales); - - virtual void exec(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_) = 0; - virtual ~InterpolateExecutorBase() = default; - VectorDims getSrcDimPad5d() const { return srcDimPad5d; } - - private: - void buildTblNN(const VectorDims& srcDimPad5d, const VectorDims& dstDim5d, const std::vector& dataScales, - InterpolateLayoutType layout, InterpolateNearestMode nearestMode); - void buildTblLinearOnnx(const VectorDims& srcDimPad5d, const VectorDims& dstDim5d, const std::vector& dataScales, - InterpolateLayoutType layout); - void buildTblLinear(const VectorDims& srcDimPad5d, const VectorDims& dstDim5d, const std::vector& dataScales, int kernel_width, - bool antialias); - void buildTblCubic(const VectorDims& srcDimPad5d, const VectorDims& dstDim5d, const std::vector& dataScales, float cubicCoeff, - InterpolateLayoutType layout); - void buildTblPillow(const VectorDims& srcDimPad5d, const VectorDims& dstDim5d, const std::vector& dataScales, - float cubicCoeff, InterpolateLayoutType layout); - - float coordTransToInput(int outCoord, float scale, int inShape, int outShape) const; - int nearestRound(float origin, bool isDownsample, InterpolateNearestMode nearestMode) const; - void linearOnnxCF(int outCoord, float scale, int inShape, int outShape, int& index0, int& index1, float& weight0, float& weight1); - std::vector getCubicCoeffs(float mantissa, float a); - static float getPillowBilinearCoeffs(float m); - static float getPillowBicubicCoeffs(float m); - inline void create_pillow_working_buf(InterpolateLayoutType layout); - - protected: - InterpolateMode mode; - InterpolateCoordTransMode coordTransMode; - InterpolateLayoutType configured_for_layout; - VectorDims srcDimPad5d, 
dstDim5d; - ov::element::Type inputPrec, outputPrec; - size_t srcDataSize, dstDataSize; - int spatialDimSize; - size_t dataRank; - std::vector auxTable; - std::vector pillow_working_buf; - size_t m_threads_num = 0lu; + public: + InterpolateExecutorBase(const InterpolateAttrs& interpAttrs, + const VectorDims& srcDims, + const VectorDims& dstDims, + const std::vector& dataScales); + + virtual void exec(const uint8_t* in_ptr_, uint8_t* out_ptr_, const void* post_ops_data_) = 0; + virtual ~InterpolateExecutorBase() = default; + VectorDims getSrcDimPad5d() const { + return srcDimPad5d; + } + + private: + void buildTblNN(const VectorDims& srcDimPad5d, + const VectorDims& dstDim5d, + const std::vector& dataScales, + InterpolateLayoutType layout, + InterpolateNearestMode nearestMode); + void buildTblLinearOnnx(const VectorDims& srcDimPad5d, + const VectorDims& dstDim5d, + const std::vector& dataScales, + InterpolateLayoutType layout); + void buildTblLinear(const VectorDims& srcDimPad5d, + const VectorDims& dstDim5d, + const std::vector& dataScales, + int kernel_width, + bool antialias); + void buildTblCubic(const VectorDims& srcDimPad5d, + const VectorDims& dstDim5d, + const std::vector& dataScales, + float cubicCoeff, + InterpolateLayoutType layout); + void buildTblPillow(const VectorDims& srcDimPad5d, + const VectorDims& dstDim5d, + const std::vector& dataScales, + float cubicCoeff, + InterpolateLayoutType layout); + + float coordTransToInput(int outCoord, float scale, int inShape, int outShape) const; + int nearestRound(float origin, bool isDownsample, InterpolateNearestMode nearestMode) const; + void linearOnnxCF(int outCoord, + float scale, + int inShape, + int outShape, + int& index0, + int& index1, + float& weight0, + float& weight1); + std::vector getCubicCoeffs(float mantissa, float a); + static float getPillowBilinearCoeffs(float m); + static float getPillowBicubicCoeffs(float m); + inline void create_pillow_working_buf(InterpolateLayoutType layout); + + 
protected: + InterpolateMode mode; + InterpolateCoordTransMode coordTransMode; + InterpolateLayoutType configured_for_layout; + VectorDims srcDimPad5d, dstDim5d; + ov::element::Type inputPrec, outputPrec; + size_t srcDataSize, dstDataSize; + int spatialDimSize; + size_t dataRank; + std::vector auxTable; + std::vector pillow_working_buf; + size_t m_threads_num = 0lu; }; std::shared_ptr execPtr = nullptr; class InterpolateJitExecutor : public InterpolateExecutorBase { - public: - InterpolateJitExecutor(const InterpolateAttrs& interpAttrs, - const VectorDims &srcDims, - const VectorDims &dstDims, - const std::vector &dataScales, - const dnnl::primitive_attr &attr); - - void exec(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_) override; - - private: - // nearest neighbor - void NNPlanar(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, - int B, int C, int ID, int IH, int IW, int OD, int OH, int OW); - void NNCGathered(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, - int B, int C, int ID, int IH, int IW, int OD, int OH, int OW); - - // onnx linear - void linearOnnxPlanar(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, - int B, int C, int ID, int IH, int IW, int OD, int OH, int OW); - void linearOnnxCGathered(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, - int B, int C, int ID, int IH, int IW, int OD, int OH, int OW); - - // cubic - void cubicPlanar(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, - int B, int C, int IH, int IW, int OH, int OW); - void cubicCGathered(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, - int B, int C, int IH, int IW, int OH, int OW); - - // pillow bilinear and pillow bicubic - void pillowCGathered(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, - int B, int C, int IH, int IW, int OH, int OW); - - private: - std::shared_ptr interpolateKernel = nullptr; + public: 
+ InterpolateJitExecutor(const InterpolateAttrs& interpAttrs, + const VectorDims& srcDims, + const VectorDims& dstDims, + const std::vector& dataScales, + const dnnl::primitive_attr& attr); + + void exec(const uint8_t* in_ptr_, uint8_t* out_ptr_, const void* post_ops_data_) override; + + private: + // nearest neighbor + void NNPlanar(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + const void* post_ops_data_, + int B, + int C, + int ID, + int IH, + int IW, + int OD, + int OH, + int OW); + void NNCGathered(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + const void* post_ops_data_, + int B, + int C, + int ID, + int IH, + int IW, + int OD, + int OH, + int OW); + + // onnx linear + void linearOnnxPlanar(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + const void* post_ops_data_, + int B, + int C, + int ID, + int IH, + int IW, + int OD, + int OH, + int OW); + void linearOnnxCGathered(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + const void* post_ops_data_, + int B, + int C, + int ID, + int IH, + int IW, + int OD, + int OH, + int OW); + + // cubic + void cubicPlanar(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + const void* post_ops_data_, + int B, + int C, + int IH, + int IW, + int OH, + int OW); + void cubicCGathered(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + const void* post_ops_data_, + int B, + int C, + int IH, + int IW, + int OH, + int OW); + + // pillow bilinear and pillow bicubic + void pillowCGathered(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + const void* post_ops_data_, + int B, + int C, + int IH, + int IW, + int OH, + int OW); + + private: + std::shared_ptr interpolateKernel = nullptr; }; class InterpolateRefExecutor : public InterpolateExecutorBase { - public: - InterpolateRefExecutor(const InterpolateAttrs& interpAttrs, - const VectorDims &srcDims, - const VectorDims &dstDims, - const std::vector &_dataScales) : - InterpolateExecutorBase(interpAttrs, srcDims, dstDims, _dataScales), - antialias(interpAttrs.antialias), dataScales(_dataScales) {} - - void 
exec(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_) override; - - private: - void NNRef(const uint8_t *in_ptr_, uint8_t *out_ptr_, int B, int C, int ID, int IH, int IW, int OD, int OH, int OW); - void linearOnnxRef(const uint8_t *in_ptr_, uint8_t *out_ptr_, int B, int C, int ID, int IH, int IW, int OD, int OH, int OW); - - void cubicRef(const uint8_t *in_ptr_, uint8_t *out_ptr_, int B, int C, int IH, int IW, int OH, int OW); - void linearInterpolation(const uint8_t *in_ptr_, uint8_t *out_ptr_, int B, int C, int ID, int IH, int IW, - float fx, float fy, float fz, int OD, int OH, int OW, int kernel_width, bool antialias); - void pillowRef(const uint8_t *in_ptr_, uint8_t *out_ptr_, int B, int C, int IH, int IW, int OH, int OW); - - static float getValue(const uint8_t *base, size_t offset, ov::element::Type prec); - static void setValue(uint8_t *base, size_t offset, float value, ov::element::Type prec); - - private: - bool antialias; - std::vector dataScales; + public: + InterpolateRefExecutor(const InterpolateAttrs& interpAttrs, + const VectorDims& srcDims, + const VectorDims& dstDims, + const std::vector& _dataScales) + : InterpolateExecutorBase(interpAttrs, srcDims, dstDims, _dataScales), + antialias(interpAttrs.antialias), + dataScales(_dataScales) {} + + void exec(const uint8_t* in_ptr_, uint8_t* out_ptr_, const void* post_ops_data_) override; + + private: + void + NNRef(const uint8_t* in_ptr_, uint8_t* out_ptr_, int B, int C, int ID, int IH, int IW, int OD, int OH, int OW); + void linearOnnxRef(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + int B, + int C, + int ID, + int IH, + int IW, + int OD, + int OH, + int OW); + + void cubicRef(const uint8_t* in_ptr_, uint8_t* out_ptr_, int B, int C, int IH, int IW, int OH, int OW); + void linearInterpolation(const uint8_t* in_ptr_, + uint8_t* out_ptr_, + int B, + int C, + int ID, + int IH, + int IW, + float fx, + float fy, + float fz, + int OD, + int OH, + int OW, + int kernel_width, + bool 
antialias); + void pillowRef(const uint8_t* in_ptr_, uint8_t* out_ptr_, int B, int C, int IH, int IW, int OH, int OW); + + static float getValue(const uint8_t* base, size_t offset, ov::element::Type prec); + static void setValue(uint8_t* base, size_t offset, float value, ov::element::Type prec); + + private: + bool antialias; + std::vector dataScales; }; - void setPostOps(dnnl::primitive_attr &attr, const VectorDims &dims); + void setPostOps(dnnl::primitive_attr& attr, const VectorDims& dims); - static VectorDims getPaddedInputShape(const VectorDims &srcDims, const std::vector &padBegin, const std::vector &padEnd); - std::vector getScales(const VectorDims &srcDimPad, const VectorDims &dstDim); + static VectorDims getPaddedInputShape(const VectorDims& srcDims, + const std::vector& padBegin, + const std::vector& padEnd); + std::vector getScales(const VectorDims& srcDimPad, const VectorDims& dstDim); static size_t getSpatialDimsNum(const Dim rank); bool hasPad = false; @@ -244,6 +353,6 @@ class Interpolate : public Node { std::shared_ptr aclExecPtr = nullptr; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov +} // namespace node +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp index 7ac3b603353541..cfe36f78cc40f9 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp @@ -648,6 +648,7 @@ std::shared_ptr jit_uni_eltwise_generic::create_eltwise_emitte OV_CASE(Algorithm::EltwiseEqual, ov::intel_cpu::aarch64::jit_equal_emitter), OV_CASE(Algorithm::EltwiseExp, ov::intel_cpu::aarch64::jit_exp_emitter), OV_CASE(Algorithm::EltwiseFloor, ov::intel_cpu::aarch64::jit_floor_emitter), + OV_CASE(Algorithm::EltwiseFloorMod, ov::intel_cpu::aarch64::jit_floor_mod_emitter), 
OV_CASE(Algorithm::EltwiseCeiling, ov::intel_cpu::aarch64::jit_ceiling_emitter), OV_CASE(Algorithm::EltwiseHswish, ov::intel_cpu::aarch64::jit_hswish_emitter), OV_CASE(Algorithm::EltwiseIsFinite, ov::intel_cpu::aarch64::jit_is_finite_emitter), @@ -670,6 +671,8 @@ std::shared_ptr jit_uni_eltwise_generic::create_eltwise_emitte OV_CASE(Algorithm::EltwisePowerStatic, ov::intel_cpu::aarch64::jit_power_static_emitter), OV_CASE(Algorithm::EltwisePrelu, ov::intel_cpu::aarch64::jit_prelu_emitter), OV_CASE(Algorithm::EltwiseRelu, ov::intel_cpu::aarch64::jit_relu_emitter), + OV_CASE(Algorithm::EltwiseRoundHalfAwayFromZero, ov::intel_cpu::aarch64::jit_round_half_away_from_zero_emitter), + OV_CASE(Algorithm::EltwiseRoundHalfToEven, ov::intel_cpu::aarch64::jit_round_half_to_even_emitter), OV_CASE(Algorithm::EltwiseSelect, ov::intel_cpu::aarch64::jit_select_emitter), OV_CASE(Algorithm::EltwiseSigmoid, ov::intel_cpu::aarch64::jit_sigmoid_emitter), OV_CASE(Algorithm::EltwiseSoftSign, ov::intel_cpu::aarch64::jit_soft_sign_emitter), @@ -830,6 +833,7 @@ std::set> eltwise_precision_helper::get_supported_pre OV_CASE(Algorithm::EltwiseEqual, jit_equal_emitter), OV_CASE(Algorithm::EltwiseExp, jit_exp_emitter), OV_CASE(Algorithm::EltwiseFloor, jit_floor_emitter), + OV_CASE(Algorithm::EltwiseFloorMod, jit_floor_mod_emitter), OV_CASE(Algorithm::EltwiseCeiling, jit_ceiling_emitter), OV_CASE(Algorithm::EltwiseGeluErf, jit_gelu_erf_emitter), OV_CASE(Algorithm::EltwiseGeluTanh, jit_gelu_tanh_emitter), @@ -851,6 +855,8 @@ std::set> eltwise_precision_helper::get_supported_pre OV_CASE(Algorithm::EltwiseMultiply, jit_multiply_emitter), OV_CASE(Algorithm::EltwisePrelu, jit_prelu_emitter), OV_CASE(Algorithm::EltwisePowerStatic, jit_power_static_emitter), + OV_CASE(Algorithm::EltwiseRoundHalfAwayFromZero, jit_round_half_away_from_zero_emitter), + OV_CASE(Algorithm::EltwiseRoundHalfToEven, jit_round_half_to_even_emitter), OV_CASE(Algorithm::EltwiseSelect, jit_select_emitter), 
OV_CASE(Algorithm::EltwiseSigmoid, jit_sigmoid_emitter), OV_CASE(Algorithm::EltwiseSoftSign, jit_soft_sign_emitter), diff --git a/src/plugins/intel_cpu/src/nodes/kernels/jit_eltwise_call_args_ptrs.hpp b/src/plugins/intel_cpu/src/nodes/kernels/jit_eltwise_call_args_ptrs.hpp index 7370bb824d8c62..66f119ee839b14 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/jit_eltwise_call_args_ptrs.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/jit_eltwise_call_args_ptrs.hpp @@ -9,21 +9,21 @@ namespace ov { namespace intel_cpu { namespace node { -#define MAX_ELTWISE_INPUTS 7 +#define MAX_ELTWISE_INPUTS 7 #define MAX_ELTWISE_DIM_RANK 12 struct jit_eltwise_call_args_ptrs { - const void *src_ptr[MAX_ELTWISE_INPUTS]; - void *dst_ptr; - //ptr to array of post op inputs pointers (flat list) + const void* src_ptr[MAX_ELTWISE_INPUTS]; + void* dst_ptr; + // ptr to array of post op inputs pointers (flat list) const void** post_op_data; // shape agnostic kernel size_t work_amount; - const void *src_offsets[MAX_ELTWISE_INPUTS]; - const void *dst_offsets; + const void* src_offsets[MAX_ELTWISE_INPUTS]; + const void* dst_offsets; }; -} // namespace node -} // namespace intel_cpu -} // namespace ov \ No newline at end of file +} // namespace node +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_memcpy.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_memcpy.cpp index 755330bd850c4d..b4d38086cefe8a 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_memcpy.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_memcpy.cpp @@ -13,10 +13,10 @@ # include #endif -#include "openvino/core/type/bfloat16.hpp" -#include "openvino/core/parallel.hpp" -#include "common.hpp" #include "attn_memcpy.hpp" +#include "common.hpp" +#include "openvino/core/parallel.hpp" +#include "openvino/core/type/bfloat16.hpp" namespace ov { namespace Extensions { @@ -26,7 +26,7 @@ 
namespace XARCH { using namespace ov; // float16 <- float -template +template void attn_copy(TA* a, TB* b, size_t n) { size_t i = 0; #if defined(HAVE_AVX512F) @@ -51,14 +51,11 @@ void attn_memcpy_kernel(const ov::intel_cpu::PlainTensor& k_input, const ov::intel_cpu::PlainTensor& past_k_output, const ov::intel_cpu::PlainTensor& past_v_output) { // For compatibility, all input_kvs are permuted to BHLS - size_t B = k_input.m_dims[0], H = k_input.m_dims[1], L1 = k_input.m_dims[2], S = k_input.m_dims[3], SV = v_input.m_dims[3]; + size_t B = k_input.m_dims[0], H = k_input.m_dims[1], L1 = k_input.m_dims[2], S = k_input.m_dims[3], + SV = v_input.m_dims[3]; parallel_for3d(L1, B, H, [&](size_t m, size_t b, size_t h) { - attn_copy(past_k_output.ptr(b, h, m, 0), - k_input.ptr(b, h, m, 0), - S); - attn_copy(past_v_output.ptr(b, h, m, 0), - v_input.ptr(b, h, m, 0), - SV); + attn_copy(past_k_output.ptr(b, h, m, 0), k_input.ptr(b, h, m, 0), S); + attn_copy(past_v_output.ptr(b, h, m, 0), v_input.ptr(b, h, m, 0), SV); }); } @@ -67,14 +64,11 @@ static void attn_memcpy_kernel(const ov::intel_cpu::PlainTensor& k_input, const ov::intel_cpu::PlainTensor& past_k_output, const ov::intel_cpu::PlainTensor& past_v_output) { // For compatibility, all input_kvs are permuted to BHLS - size_t B = k_input.m_dims[0], H = k_input.m_dims[1], L1 = k_input.m_dims[2], S = k_input.m_dims[3], SV = v_input.m_dims[3]; + size_t B = k_input.m_dims[0], H = k_input.m_dims[1], L1 = k_input.m_dims[2], S = k_input.m_dims[3], + SV = v_input.m_dims[3]; parallel_for3d(L1, B, H, [&](size_t m, size_t b, size_t h) { - std::memcpy(past_k_output.ptr_v(b, h, m, 0), - k_input.ptr_v(b, h, m, 0), - S * k_input.m_element_size); - std::memcpy(past_v_output.ptr_v(b, h, m, 0), - v_input.ptr_v(b, h, m, 0), - SV * v_input.m_element_size); + std::memcpy(past_k_output.ptr_v(b, h, m, 0), k_input.ptr_v(b, h, m, 0), S * k_input.m_element_size); + std::memcpy(past_v_output.ptr_v(b, h, m, 0), v_input.ptr_v(b, h, m, 0), SV * 
v_input.m_element_size); }); } @@ -84,19 +78,17 @@ static void paged_attn_memcpy_kernel(const ov::intel_cpu::PlainTensor& k_input, const ov::intel_cpu::PlainTensor& past_k_output, const ov::intel_cpu::PlainTensor& past_v_output, const ov::intel_cpu::PlainTensor& slot_mapping) { - size_t B = k_input.m_dims[0], H = k_input.m_dims[1], L1 = k_input.m_dims[2], S = k_input.m_dims[3], SV = v_input.m_dims[3]; + size_t B = k_input.m_dims[0], H = k_input.m_dims[1], L1 = k_input.m_dims[2], S = k_input.m_dims[3], + SV = v_input.m_dims[3]; size_t block_size = past_k_output.m_dims[2]; parallel_for3d(B, L1, H, [&](size_t b, size_t m, size_t h) { auto slot = slot_mapping.ptr(b)[m]; - if (slot < 0) return; + if (slot < 0) + return; auto block_number = slot / block_size; auto block_offset = slot % block_size; - attn_copy(past_k_output.ptr(block_number, h, block_offset, 0), - k_input.ptr(b, h, m, 0), - S); - attn_copy(past_v_output.ptr(block_number, h, block_offset, 0), - v_input.ptr(b, h, m, 0), - SV); + attn_copy(past_k_output.ptr(block_number, h, block_offset, 0), k_input.ptr(b, h, m, 0), S); + attn_copy(past_v_output.ptr(block_number, h, block_offset, 0), v_input.ptr(b, h, m, 0), SV); }); } @@ -105,11 +97,13 @@ static void paged_attn_memcpy_kernel(const ov::intel_cpu::PlainTensor& k_input, const ov::intel_cpu::PlainTensor& past_k_output, const ov::intel_cpu::PlainTensor& past_v_output, const ov::intel_cpu::PlainTensor& slot_mapping) { - size_t B = k_input.m_dims[0], H = k_input.m_dims[1], L1 = k_input.m_dims[2], S = k_input.m_dims[3], SV = v_input.m_dims[3]; + size_t B = k_input.m_dims[0], H = k_input.m_dims[1], L1 = k_input.m_dims[2], S = k_input.m_dims[3], + SV = v_input.m_dims[3]; size_t block_size = past_k_output.m_dims[2]; parallel_for3d(B, L1, H, [&](size_t b, size_t m, size_t h) { auto slot = slot_mapping.ptr(b)[m]; - if (slot < 0) return; + if (slot < 0) + return; auto block_number = slot / block_size; auto block_offset = slot % block_size; 
std::memcpy(past_k_output.ptr_v(block_number, h, block_offset, 0), @@ -132,7 +126,11 @@ void attn_memcpy(const ov::intel_cpu::PlainTensor& k_input, } else if (k_input.get_precision() == ov::element::f32 && past_k_output.get_precision() == ov::element::bf16) { attn_memcpy_kernel(k_input, v_input, past_k_output, past_v_output); } else { - OPENVINO_THROW("unsupport src type: ", k_input.get_precision(), ", dst type: ", past_k_output.get_precision(), " in attn_memcpy"); + OPENVINO_THROW("unsupport src type: ", + k_input.get_precision(), + ", dst type: ", + past_k_output.get_precision(), + " in attn_memcpy"); } } @@ -148,7 +146,11 @@ void paged_attn_memcpy(const ov::intel_cpu::PlainTensor& k_input, } else if (k_input.get_precision() == ov::element::f32 && past_k_output.get_precision() == ov::element::bf16) { paged_attn_memcpy_kernel(k_input, v_input, past_k_output, past_v_output, slot_mapping); } else { - OPENVINO_THROW("unsupport src type: ", k_input.get_precision(), ", dst type: ", past_k_output.get_precision(), " in paged_attn_memcpy"); + OPENVINO_THROW("unsupport src type: ", + k_input.get_precision(), + ", dst type: ", + past_k_output.get_precision(), + " in paged_attn_memcpy"); } } diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_memcpy.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_memcpy.hpp index c0e5892db9926b..ea704232e333bd 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_memcpy.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_memcpy.hpp @@ -7,6 +7,7 @@ #include #include #include + #include "openvino/core/type/element_type.hpp" #include "utils/plain_tensor.hpp" diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.cpp index 66772bda03db51..095180d659142e 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.cpp +++ 
b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.cpp @@ -13,11 +13,11 @@ # include #endif -#include "openvino/core/type/bfloat16.hpp" -#include "openvino/core/parallel.hpp" -#include "common.hpp" #include "attn_quant.hpp" #include "attn_quant_kernel.hpp" +#include "common.hpp" +#include "openvino/core/parallel.hpp" +#include "openvino/core/type/bfloat16.hpp" namespace ov { namespace Extensions { @@ -26,7 +26,7 @@ namespace XARCH { using namespace ov; -template +template static void quant_u8(const T* src, uint8_t* dst, size_t n, float& scale, float& zp) { size_t i = 0; float max = -FLT_MAX; @@ -182,16 +182,8 @@ static void attn_quant_mt(const ov::intel_cpu::PlainTensor& k_src, parallel_for3d(L1, B, H, [&](size_t m, size_t b, size_t h) { auto p_k = k_scale_zp.ptr(m, b, h); auto p_v = v_scale_zp.ptr(m, b, h); - quant_u8(k_src.ptr(b, h, m), - k_dst.ptr(b, h, m), - S, - p_k[0], - p_k[1]); - quant_u8(v_src.ptr(b, h, m), - v_dst.ptr(b, h, m), - SV, - p_v[0], - p_v[1]); + quant_u8(k_src.ptr(b, h, m), k_dst.ptr(b, h, m), S, p_k[0], p_k[1]); + quant_u8(v_src.ptr(b, h, m), v_dst.ptr(b, h, m), SV, p_v[0], p_v[1]); }); } @@ -205,14 +197,16 @@ static void paged_attn_quant_mt(const ov::intel_cpu::PlainTensor& k_src, size_t block_size = k_dst.m_dims[2]; parallel_for3d(B, L1, H, [&](size_t b, size_t m, size_t h) { auto slot = slot_mapping.ptr(b)[m]; - if (slot < 0) return; + if (slot < 0) + return; auto block_number = slot / block_size; auto block_offset = slot % block_size; auto p_k = reinterpret_cast(k_dst.ptr(block_number, h, block_offset)); auto p_v = reinterpret_cast(v_dst.ptr(block_number, h, block_offset)); // The layout for per token per head: - // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized feature(u8,idx_S)| + // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized + // feature(u8,idx_S)| quant_u8(k_src.ptr(b, h, m), k_dst.ptr(block_number, h, block_offset) + 
sizeof(float) + sizeof(float), S, @@ -239,7 +233,11 @@ void attn_quantkv(const ov::intel_cpu::PlainTensor& k_src, } else if (k_src.get_precision() == ov::element::f16 && k_dst.get_precision() == ov::element::u8) { attn_quant_mt(k_src, v_src, k_dst, v_dst, k_scale_zp, v_scale_zp); } else { - OPENVINO_THROW("unsupport src type: ", k_src.get_precision(), ", dst type: ", k_dst.get_precision(), " in attn_quantkv"); + OPENVINO_THROW("unsupport src type: ", + k_src.get_precision(), + ", dst type: ", + k_dst.get_precision(), + " in attn_quantkv"); } } @@ -255,7 +253,11 @@ void paged_attn_quantkv(const ov::intel_cpu::PlainTensor& k_src, } else if (k_src.get_precision() == ov::element::f16 && k_dst.get_precision() == ov::element::u8) { paged_attn_quant_mt(k_src, v_src, k_dst, v_dst, slot_mapping); } else { - OPENVINO_THROW("unsupport src type: ", k_src.get_precision(), ", dst type: ", k_dst.get_precision(), " in paged_attn_quantkv"); + OPENVINO_THROW("unsupport src type: ", + k_src.get_precision(), + ", dst type: ", + k_dst.get_precision(), + " in paged_attn_quantkv"); } } diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.hpp index ca930a1055db2b..2f39f74f5b3460 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant.hpp @@ -7,6 +7,7 @@ #include #include #include + #include "openvino/core/type/element_type.hpp" #include "utils/plain_tensor.hpp" diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant_kernel.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant_kernel.hpp index 4e013a004d29f9..759d0005103871 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant_kernel.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/attn_quant_kernel.hpp @@ -3,19 +3,21 @@ // #pragma once -#include +#include 
"nodes/kernels/scaled_attn/common.hpp" + +#if defined(HAVE_SSE) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) +# include +#endif + #include #include -#include -#include "openvino/core/type/element_type.hpp" -#include "utils/plain_tensor.hpp" namespace ov { namespace Extensions { namespace Cpu { namespace XARCH { -template +template void attn_dequant_u8_kernel(const uint8_t* src, TDST* dst, size_t n, float scale, float zp) { size_t i = 0; // loadu_si128/epi64 does not support const qualifier @@ -53,4 +55,4 @@ void attn_dequant_u8_kernel(const uint8_t* src, TDST* dst, size_t n, float scale } // namespace XARCH } // namespace Cpu } // namespace Extensions -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp index 2956c8a6a6b5b8..4e14cf5894b04d 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp @@ -4,16 +4,20 @@ #pragma once #include +#include #include #include #include -#include #include "openvino/core/type/bfloat16.hpp" #include "openvino/core/type/float16.hpp" +#if defined(HAVE_SSE) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) +# include +#endif + #if defined(OPENVINO_ARCH_ARM64) -#include "arm_neon.h" +# include "arm_neon.h" #endif namespace ov { @@ -32,307 +36,307 @@ static constexpr size_t vec_len_f32_neon = vec_len_neon / sizeof(float); static constexpr size_t vec_len_f16_neon = vec_len_neon / sizeof(ov::float16); #ifdef HAVE_AVX512F - inline __m512 cvt_bf16_to_fp32(const __m256i src) { - __m512i y = _mm512_cvtepu16_epi32(src); - return _mm512_castsi512_ps(_mm512_slli_epi32(y, 16)); - } - - // load addr to __m512 reg - inline __m512 mm512_uni_loadu_ps(const float* a) { - return _mm512_loadu_ps(a); - } - - inline __m512 mm512_uni_loadu_ps(const ov::bfloat16* a) { - auto vec_bf16 = 
_mm256_loadu_si256(reinterpret_cast(a)); - return cvt_bf16_to_fp32(vec_bf16); - } - - inline __m512 mm512_uni_loadu_ps(const ov::float16* a) { - auto vec_f16 = _mm256_loadu_si256(reinterpret_cast(a)); - return _mm512_cvtph_ps(vec_f16); - } - - // load addr to __m512 reg - inline __m512 mm512_uni_loadu_tail_ps(const float* a, size_t count) { - __mmask16 mask = (1 << count) - 1; - return _mm512_maskz_loadu_ps(mask, a); - } - - inline __m512 mm512_uni_loadu_tail_ps(const ov::bfloat16* a, size_t count) { - auto mask = (1 << count) - 1; - auto bf16_vec = _mm256_maskz_loadu_epi16(mask, a); - return cvt_bf16_to_fp32(bf16_vec); - } - - inline __m512 mm512_uni_loadu_tail_ps(const ov::float16* a, size_t count) { - auto mask = (1 << count) - 1; - auto f16_vec = _mm256_maskz_loadu_epi16(mask, a); - return _mm512_cvtph_ps(f16_vec); - } - - // store __m512 reg to addr - inline void mm512_uni_storeu_ps(float* a, __m512 v) { - _mm512_storeu_ps(a, v); - } - inline void mm512_uni_storeu_ps(ov::bfloat16 *addr, __m512 xps) { - __m512i xpi32 = _mm512_castps_si512(xps); - __m512i nan = _mm512_set1_epi32(0xffff); - auto mask = _mm512_cmp_ps_mask(xps, xps, _CMP_ORD_Q); - __m512i ones = _mm512_set1_epi32(0x1); - __m512i vec_bias = _mm512_set1_epi32(0x7fff); - auto x = _mm512_and_si512(_mm512_srli_epi32(xpi32, 16), ones); // LSB = x[16] - x = _mm512_add_epi32(x, vec_bias); // rounding_bias = 0x7fff + LSB - x = _mm512_srli_epi32(_mm512_add_epi32(x, xpi32), 16); // x = (x + rounding_bias) >> 16; - x = _mm512_mask_blend_epi32(mask, nan, x); // Check NaN before converting back to bf16 - _mm256_storeu_si256(reinterpret_cast<__m256i *>(addr), _mm512_cvtepi32_epi16(x)); - } - - inline void mm512_uni_storeu_ps(ov::float16* addr, __m512 v) { - __m256i vec_f16 = _mm512_cvtps_ph(v, 0); - _mm256_storeu_si256(reinterpret_cast<__m256i *>(addr), vec_f16); - } - - // store __m512 reg to addr - inline void mm512_uni_mask_storeu_ps(ov::bfloat16 *addr, __mmask16 mask_addr, __m512 xps) { - __m512i xpi32 = 
_mm512_castps_si512(xps); - __m512i nan = _mm512_set1_epi32(0xffff); - auto mask = _mm512_cmp_ps_mask(xps, xps, _CMP_ORD_Q); - __m512i ones = _mm512_set1_epi32(0x1); - __m512i vec_bias = _mm512_set1_epi32(0x7fff); - auto x = _mm512_and_si512(_mm512_srli_epi32(xpi32, 16), ones); // LSB = x[16] - x = _mm512_add_epi32(x, vec_bias); // rounding_bias = 0x7fff + LSB - x = _mm512_srli_epi32(_mm512_add_epi32(x, xpi32), 16); // x = (x + rounding_bias) >> 16; - x = _mm512_mask_blend_epi32(mask, nan, x); // Check NaN before converting back to bf16 - _mm512_mask_cvtepi32_storeu_epi16(addr, mask_addr, x); - } - - inline void mm512_uni_storeu_tail_ps(float *addr, __m512 v, size_t count) { - __mmask16 mask_addr = (1 << count) - 1; - _mm512_mask_storeu_ps(addr, mask_addr, v); - } - - inline void mm512_uni_storeu_tail_ps(ov::bfloat16 *addr, __m512 v, size_t count) { - __mmask16 mask_addr = (1 << count) - 1; - __m512i xpi32 = _mm512_castps_si512(v); - __m512i nan = _mm512_set1_epi32(0xffff); - auto mask = _mm512_cmp_ps_mask(v, v, _CMP_ORD_Q); - __m512i ones = _mm512_set1_epi32(0x1); - __m512i vec_bias = _mm512_set1_epi32(0x7fff); - auto x = _mm512_and_si512(_mm512_srli_epi32(xpi32, 16), ones); // LSB = x[16] - x = _mm512_add_epi32(x, vec_bias); // rounding_bias = 0x7fff + LSB - x = _mm512_srli_epi32(_mm512_add_epi32(x, xpi32), 16); // x = (x + rounding_bias) >> 16; - x = _mm512_mask_blend_epi32(mask, nan, x); // Check NaN before converting back to bf16 - _mm512_mask_cvtepi32_storeu_epi16(addr, mask_addr, x); - } - - inline void mm512_uni_storeu_tail_ps(ov::float16 *addr, __m512 v, size_t count) { - __mmask16 mask_addr = (1 << count) - 1; - __m256i vec_f16 = _mm512_cvtps_ph(v, 0); - _mm256_mask_storeu_epi16(reinterpret_cast<__m256i *>(addr), mask_addr, vec_f16); - } +inline __m512 cvt_bf16_to_fp32(const __m256i src) { + __m512i y = _mm512_cvtepu16_epi32(src); + return _mm512_castsi512_ps(_mm512_slli_epi32(y, 16)); +} + +// load addr to __m512 reg +inline __m512 
mm512_uni_loadu_ps(const float* a) { + return _mm512_loadu_ps(a); +} + +inline __m512 mm512_uni_loadu_ps(const ov::bfloat16* a) { + auto vec_bf16 = _mm256_loadu_si256(reinterpret_cast(a)); + return cvt_bf16_to_fp32(vec_bf16); +} + +inline __m512 mm512_uni_loadu_ps(const ov::float16* a) { + auto vec_f16 = _mm256_loadu_si256(reinterpret_cast(a)); + return _mm512_cvtph_ps(vec_f16); +} + +// load addr to __m512 reg +inline __m512 mm512_uni_loadu_tail_ps(const float* a, size_t count) { + __mmask16 mask = (1 << count) - 1; + return _mm512_maskz_loadu_ps(mask, a); +} + +inline __m512 mm512_uni_loadu_tail_ps(const ov::bfloat16* a, size_t count) { + auto mask = (1 << count) - 1; + auto bf16_vec = _mm256_maskz_loadu_epi16(mask, a); + return cvt_bf16_to_fp32(bf16_vec); +} + +inline __m512 mm512_uni_loadu_tail_ps(const ov::float16* a, size_t count) { + auto mask = (1 << count) - 1; + auto f16_vec = _mm256_maskz_loadu_epi16(mask, a); + return _mm512_cvtph_ps(f16_vec); +} + +// store __m512 reg to addr +inline void mm512_uni_storeu_ps(float* a, __m512 v) { + _mm512_storeu_ps(a, v); +} +inline void mm512_uni_storeu_ps(ov::bfloat16* addr, __m512 xps) { + __m512i xpi32 = _mm512_castps_si512(xps); + __m512i nan = _mm512_set1_epi32(0xffff); + auto mask = _mm512_cmp_ps_mask(xps, xps, _CMP_ORD_Q); + __m512i ones = _mm512_set1_epi32(0x1); + __m512i vec_bias = _mm512_set1_epi32(0x7fff); + auto x = _mm512_and_si512(_mm512_srli_epi32(xpi32, 16), ones); // LSB = x[16] + x = _mm512_add_epi32(x, vec_bias); // rounding_bias = 0x7fff + LSB + x = _mm512_srli_epi32(_mm512_add_epi32(x, xpi32), 16); // x = (x + rounding_bias) >> 16; + x = _mm512_mask_blend_epi32(mask, nan, x); // Check NaN before converting back to bf16 + _mm256_storeu_si256(reinterpret_cast<__m256i*>(addr), _mm512_cvtepi32_epi16(x)); +} + +inline void mm512_uni_storeu_ps(ov::float16* addr, __m512 v) { + __m256i vec_f16 = _mm512_cvtps_ph(v, 0); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(addr), vec_f16); +} + +// store __m512 
reg to addr +inline void mm512_uni_mask_storeu_ps(ov::bfloat16* addr, __mmask16 mask_addr, __m512 xps) { + __m512i xpi32 = _mm512_castps_si512(xps); + __m512i nan = _mm512_set1_epi32(0xffff); + auto mask = _mm512_cmp_ps_mask(xps, xps, _CMP_ORD_Q); + __m512i ones = _mm512_set1_epi32(0x1); + __m512i vec_bias = _mm512_set1_epi32(0x7fff); + auto x = _mm512_and_si512(_mm512_srli_epi32(xpi32, 16), ones); // LSB = x[16] + x = _mm512_add_epi32(x, vec_bias); // rounding_bias = 0x7fff + LSB + x = _mm512_srli_epi32(_mm512_add_epi32(x, xpi32), 16); // x = (x + rounding_bias) >> 16; + x = _mm512_mask_blend_epi32(mask, nan, x); // Check NaN before converting back to bf16 + _mm512_mask_cvtepi32_storeu_epi16(addr, mask_addr, x); +} + +inline void mm512_uni_storeu_tail_ps(float* addr, __m512 v, size_t count) { + __mmask16 mask_addr = (1 << count) - 1; + _mm512_mask_storeu_ps(addr, mask_addr, v); +} + +inline void mm512_uni_storeu_tail_ps(ov::bfloat16* addr, __m512 v, size_t count) { + __mmask16 mask_addr = (1 << count) - 1; + __m512i xpi32 = _mm512_castps_si512(v); + __m512i nan = _mm512_set1_epi32(0xffff); + auto mask = _mm512_cmp_ps_mask(v, v, _CMP_ORD_Q); + __m512i ones = _mm512_set1_epi32(0x1); + __m512i vec_bias = _mm512_set1_epi32(0x7fff); + auto x = _mm512_and_si512(_mm512_srli_epi32(xpi32, 16), ones); // LSB = x[16] + x = _mm512_add_epi32(x, vec_bias); // rounding_bias = 0x7fff + LSB + x = _mm512_srli_epi32(_mm512_add_epi32(x, xpi32), 16); // x = (x + rounding_bias) >> 16; + x = _mm512_mask_blend_epi32(mask, nan, x); // Check NaN before converting back to bf16 + _mm512_mask_cvtepi32_storeu_epi16(addr, mask_addr, x); +} + +inline void mm512_uni_storeu_tail_ps(ov::float16* addr, __m512 v, size_t count) { + __mmask16 mask_addr = (1 << count) - 1; + __m256i vec_f16 = _mm512_cvtps_ph(v, 0); + _mm256_mask_storeu_epi16(reinterpret_cast<__m256i*>(addr), mask_addr, vec_f16); +} #endif #ifdef HAVE_AVX2 - inline __m256i get_mask(int N7) { - static __m256i mask[] = { - 
_mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 0), - _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1), - _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1), - _mm256_set_epi32(0, 0, 0, 0, 0, -1, -1, -1), - _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1), - _mm256_set_epi32(0, 0, 0, -1, -1, -1, -1, -1), - _mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1), - _mm256_set_epi32(0, -1, -1, -1, -1, -1, -1, -1), - _mm256_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1), - }; - return _mm256_loadu_si256(&mask[N7]); - } - - // load addr to __m256 reg - inline __m256 mm256_uni_loadu_ps(const float* a) { - return _mm256_loadu_ps(a); - } - - inline __m256 mm256_uni_loadu_ps(const ov::bfloat16* a) { - auto vec_bf16 = _mm_loadu_si128(reinterpret_cast(a)); - auto o = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(vec_bf16), 16)); - return o; - } - - inline __m256 mm256_uni_loadu_ps(const ov::float16* a) { - auto vec_f16 = _mm_loadu_si128(reinterpret_cast(a)); - auto o = _mm256_cvtph_ps(vec_f16); - return o; - } - - // load addr tail to __m256 reg - inline __m256 mm256_uni_loadu_tail_ps(const float* a, const size_t count) { - auto mask = get_mask(count); - return _mm256_maskload_ps(a, mask); - } - - inline __m256 mm256_uni_loadu_tail_ps(const ov::bfloat16* a, const size_t count) { - assert("AVX2 version of bfloat16 tail load is just for compilation pass"); - ov::bfloat16 tmp_values[8] = {0}; - std::memcpy(tmp_values, a, count * sizeof(ov::bfloat16)); - return mm256_uni_loadu_ps(tmp_values); - } - - inline __m256 mm256_uni_loadu_tail_ps(const ov::float16* a, const size_t count) { - ov::float16 tmp_values[8] = {0}; - std::memcpy(tmp_values, a, count * sizeof(ov::float16)); - return mm256_uni_loadu_ps(tmp_values); - } - - // store __m256 reg to addr - inline void mm256_uni_storeu_ps(float* a, __m256 v) { - _mm256_storeu_ps(a, v); - } - - inline void mm256_uni_storeu_ps(ov::bfloat16 *addr, __m256 xps) { - __m256i xpi32 = _mm256_castps_si256(xps); - __m256i nan = _mm256_set1_epi32(0xffff); - __m256i mask = 
_mm256_castps_si256(_mm256_cmp_ps(xps, xps, _CMP_ORD_Q)); - __m256i ones = _mm256_set1_epi32(0x1); - __m256i vec_bias = _mm256_set1_epi32(0x7fff); - auto x = _mm256_and_si256(_mm256_srli_epi32(xpi32, 16), ones); // LSB = x[16] - x = _mm256_add_epi32(x, vec_bias); // rounding_bias = 0x7fff + LSB - x = _mm256_srli_epi32(_mm256_add_epi32(x, xpi32), 16); // x = (x + rounding_bias) >> 16; - x = _mm256_blendv_epi8(nan, x, mask); // Check NaN before converting back to bf16 - x = _mm256_packus_epi32(x, x); - x = _mm256_permute4x64_epi64(x, 0xd8); - __m128i bf16_o = _mm256_extractf128_si256(x, 0); - _mm_storeu_si128(reinterpret_cast<__m128i *>(addr), bf16_o); - } - - inline void mm256_uni_storeu_ps(ov::float16* a, __m256 v) { - __m128i vec_f16 = _mm256_cvtps_ph(v, 0); - _mm_storeu_si128(reinterpret_cast<__m128i *>(a), vec_f16); - } - - // store __m256 to addr - inline void mm256_uni_storeu_tail_ps(float *addr, __m256 v, size_t count) { - const auto mask = get_mask(count); - return _mm256_maskstore_ps(addr, mask, v); - } - - inline void hsum(__m256& x) { - __m256 y; // x: 0 1 2 3 4 5 6 7 - y = _mm256_permute_ps(x, 0x39); // y: 1 2 3 0 5 6 7 4 - x = _mm256_add_ps(x, y); // X: 01 12 23 30 45 56 67 74 - y = _mm256_permute_ps(x, 0x4e); // y: 23 30 01 12 67 74 45 56 - x = _mm256_add_ps(x, y); // x: 0123 x x x 4567 x x x - y = _mm256_permute2f128_ps(x, x, 1); // y: 4567 x x x 0123 x x x - x = _mm256_add_ps(x, y); // x: 01234567 x x x x x x x - } - inline void hmax(__m256& x) { - __m256 y; // x: 0 1 2 3 4 5 6 7 - y = _mm256_permute_ps(x, 0x39); // y: 1 2 3 0 5 6 7 4 - x = _mm256_max_ps(x, y); // X: 01 12 23 30 45 56 67 74 - y = _mm256_permute_ps(x, 0x4e); // y: 23 30 01 12 67 74 45 56 - x = _mm256_max_ps(x, y); // x: 0123 x x x 4567 x x x - y = _mm256_permute2f128_ps(x, x, 1); // y: 4567 x x x 0123 x x x - x = _mm256_max_ps(x, y); // x: 01234567 x x x x x x x - } - inline void hmin(__m256& x) { - __m256 y; // x: 0 1 2 3 4 5 6 7 - y = _mm256_permute_ps(x, 0x39); // y: 1 2 3 0 5 6 7 
4 - x = _mm256_min_ps(x, y); // X: 01 12 23 30 45 56 67 74 - y = _mm256_permute_ps(x, 0x4e); // y: 23 30 01 12 67 74 45 56 - x = _mm256_min_ps(x, y); // x: 0123 x x x 4567 x x x - y = _mm256_permute2f128_ps(x, x, 1); // y: 4567 x x x 0123 x x x - x = _mm256_min_ps(x, y); // x: 01234567 x x x x x x x - } +inline __m256i get_mask(int N7) { + static __m256i mask[] = { + _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 0), + _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1), + _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1), + _mm256_set_epi32(0, 0, 0, 0, 0, -1, -1, -1), + _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1), + _mm256_set_epi32(0, 0, 0, -1, -1, -1, -1, -1), + _mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1), + _mm256_set_epi32(0, -1, -1, -1, -1, -1, -1, -1), + _mm256_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1), + }; + return _mm256_loadu_si256(&mask[N7]); +} + +// load addr to __m256 reg +inline __m256 mm256_uni_loadu_ps(const float* a) { + return _mm256_loadu_ps(a); +} + +inline __m256 mm256_uni_loadu_ps(const ov::bfloat16* a) { + auto vec_bf16 = _mm_loadu_si128(reinterpret_cast(a)); + auto o = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(vec_bf16), 16)); + return o; +} + +inline __m256 mm256_uni_loadu_ps(const ov::float16* a) { + auto vec_f16 = _mm_loadu_si128(reinterpret_cast(a)); + auto o = _mm256_cvtph_ps(vec_f16); + return o; +} + +// load addr tail to __m256 reg +inline __m256 mm256_uni_loadu_tail_ps(const float* a, const size_t count) { + auto mask = get_mask(count); + return _mm256_maskload_ps(a, mask); +} + +inline __m256 mm256_uni_loadu_tail_ps(const ov::bfloat16* a, const size_t count) { + assert("AVX2 version of bfloat16 tail load is just for compilation pass"); + ov::bfloat16 tmp_values[8] = {0}; + std::memcpy(tmp_values, a, count * sizeof(ov::bfloat16)); + return mm256_uni_loadu_ps(tmp_values); +} + +inline __m256 mm256_uni_loadu_tail_ps(const ov::float16* a, const size_t count) { + ov::float16 tmp_values[8] = {0}; + std::memcpy(tmp_values, a, count * 
sizeof(ov::float16)); + return mm256_uni_loadu_ps(tmp_values); +} + +// store __m256 reg to addr +inline void mm256_uni_storeu_ps(float* a, __m256 v) { + _mm256_storeu_ps(a, v); +} + +inline void mm256_uni_storeu_ps(ov::bfloat16* addr, __m256 xps) { + __m256i xpi32 = _mm256_castps_si256(xps); + __m256i nan = _mm256_set1_epi32(0xffff); + __m256i mask = _mm256_castps_si256(_mm256_cmp_ps(xps, xps, _CMP_ORD_Q)); + __m256i ones = _mm256_set1_epi32(0x1); + __m256i vec_bias = _mm256_set1_epi32(0x7fff); + auto x = _mm256_and_si256(_mm256_srli_epi32(xpi32, 16), ones); // LSB = x[16] + x = _mm256_add_epi32(x, vec_bias); // rounding_bias = 0x7fff + LSB + x = _mm256_srli_epi32(_mm256_add_epi32(x, xpi32), 16); // x = (x + rounding_bias) >> 16; + x = _mm256_blendv_epi8(nan, x, mask); // Check NaN before converting back to bf16 + x = _mm256_packus_epi32(x, x); + x = _mm256_permute4x64_epi64(x, 0xd8); + __m128i bf16_o = _mm256_extractf128_si256(x, 0); + _mm_storeu_si128(reinterpret_cast<__m128i*>(addr), bf16_o); +} + +inline void mm256_uni_storeu_ps(ov::float16* a, __m256 v) { + __m128i vec_f16 = _mm256_cvtps_ph(v, 0); + _mm_storeu_si128(reinterpret_cast<__m128i*>(a), vec_f16); +} + +// store __m256 to addr +inline void mm256_uni_storeu_tail_ps(float* addr, __m256 v, size_t count) { + const auto mask = get_mask(count); + return _mm256_maskstore_ps(addr, mask, v); +} + +inline void hsum(__m256& x) { + __m256 y; // x: 0 1 2 3 4 5 6 7 + y = _mm256_permute_ps(x, 0x39); // y: 1 2 3 0 5 6 7 4 + x = _mm256_add_ps(x, y); // X: 01 12 23 30 45 56 67 74 + y = _mm256_permute_ps(x, 0x4e); // y: 23 30 01 12 67 74 45 56 + x = _mm256_add_ps(x, y); // x: 0123 x x x 4567 x x x + y = _mm256_permute2f128_ps(x, x, 1); // y: 4567 x x x 0123 x x x + x = _mm256_add_ps(x, y); // x: 01234567 x x x x x x x +} +inline void hmax(__m256& x) { + __m256 y; // x: 0 1 2 3 4 5 6 7 + y = _mm256_permute_ps(x, 0x39); // y: 1 2 3 0 5 6 7 4 + x = _mm256_max_ps(x, y); // X: 01 12 23 30 45 56 67 74 + y = 
_mm256_permute_ps(x, 0x4e); // y: 23 30 01 12 67 74 45 56 + x = _mm256_max_ps(x, y); // x: 0123 x x x 4567 x x x + y = _mm256_permute2f128_ps(x, x, 1); // y: 4567 x x x 0123 x x x + x = _mm256_max_ps(x, y); // x: 01234567 x x x x x x x +} +inline void hmin(__m256& x) { + __m256 y; // x: 0 1 2 3 4 5 6 7 + y = _mm256_permute_ps(x, 0x39); // y: 1 2 3 0 5 6 7 4 + x = _mm256_min_ps(x, y); // X: 01 12 23 30 45 56 67 74 + y = _mm256_permute_ps(x, 0x4e); // y: 23 30 01 12 67 74 45 56 + x = _mm256_min_ps(x, y); // x: 0123 x x x 4567 x x x + y = _mm256_permute2f128_ps(x, x, 1); // y: 4567 x x x 0123 x x x + x = _mm256_min_ps(x, y); // x: 01234567 x x x x x x x +} #endif #ifdef OPENVINO_ARCH_ARM64 - inline float32x4_t exp_ps_neon_f32(const float32x4_t& src) { - const auto c1 = vreinterpretq_f32_u32(vdupq_n_u32(0x3f7ffff6)); - const auto c2 = vreinterpretq_f32_u32(vdupq_n_u32(0x3efffedb)); - const auto c3 = vreinterpretq_f32_u32(vdupq_n_u32(0x3e2aaf33)); - const auto c4 = vreinterpretq_f32_u32(vdupq_n_u32(0x3d2b9f17)); - const auto c5 = vreinterpretq_f32_u32(vdupq_n_u32(0x3c072010)); - - const auto shift = vreinterpretq_f32_u32(vdupq_n_u32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f - const auto one = vdupq_n_f32(1.0f); // 1 - const auto two = vdupq_n_f32(2.0f); // 2 - const auto inv_ln2 = vreinterpretq_f32_u32(vdupq_n_u32(0x3fb8aa3b)); - const auto neg_ln2_hi = vreinterpretq_f32_u32(vdupq_n_u32(0xbf317200)); - const auto neg_ln2_lo = vreinterpretq_f32_u32(vdupq_n_u32(0xb5bfbe8e)); - - const auto inf = vdupq_n_f32(std::numeric_limits::infinity()); - const auto max_input = vdupq_n_f32(88.37f); // Approximately ln(2^127.5) - const auto zero = vdupq_n_f32(0.f); - const auto min_input = vdupq_n_f32(-86.64f); // Approximately ln(2^-125) - - const auto z = vmlaq_f32(shift, src, inv_ln2); - auto n = z - shift; - n = vsubq_f32(n, one); - const auto scale = vreinterpretq_f32_u32(vreinterpretq_u32_f32(z) << 23); // 2^n - - const auto r_hi = vfmaq_f32(src, n, neg_ln2_hi); - const auto r 
= vfmaq_f32(r_hi, n, neg_ln2_lo); - - const auto r2 = r * r; - - const auto p1 = c1 * r; - const auto p23 = vfmaq_f32(c2, c3, r); - const auto p45 = vfmaq_f32(c4, c5, r); - const auto p2345 = vfmaq_f32(p23, p45, r2); - const auto p12345 = vfmaq_f32(p1, p2345, r2); - - auto poly = vfmaq_f32(scale, p12345, scale); - poly = vmulq_f32(poly, two); - - poly = vbslq_f32(vcltq_f32(src, min_input), zero, poly); - poly = vbslq_f32(vcgtq_f32(src, max_input), inf, poly); - - return poly; - } - inline float32x4_t __vld1q_f32(const ov::bfloat16* a) { - uint16x4_t vec_bf16 = vld1_u16(reinterpret_cast(a)); - - float32x4_t vec_f32 = vcvtq_f32_u32(vmovl_u16(vec_bf16)); - return vec_f32; - } - inline float32x4_t __vld1q_f32(const float* a) { - return vld1q_f32(a); - } - inline float32x4_t __vld1q_f32(const ov::float16* a) { - auto _a = reinterpret_cast(a); - return vcvt_f32_f16(vld1_f16(_a)); - } - inline void __vst1q_f32(float* a, float32x4_t b) { - vst1q_f32(a, b); - } - inline void __vst1q_f32(ov::float16* a, float32x4_t b) { - float16x4_t v_f16 = vcvt_f16_f32(b); - vst1_f16(reinterpret_cast(a), v_f16); - } - inline void __vst1q_f32(ov::bfloat16* a, float32x4_t b) { - uint32x4_t v_int32 = vreinterpretq_u32_f32(b); - uint16x4_t v_bf16 = vshrn_n_u32(v_int32, 16); - - vst1_u16(reinterpret_cast(a), v_bf16); - } +inline float32x4_t exp_ps_neon_f32(const float32x4_t& src) { + const auto c1 = vreinterpretq_f32_u32(vdupq_n_u32(0x3f7ffff6)); + const auto c2 = vreinterpretq_f32_u32(vdupq_n_u32(0x3efffedb)); + const auto c3 = vreinterpretq_f32_u32(vdupq_n_u32(0x3e2aaf33)); + const auto c4 = vreinterpretq_f32_u32(vdupq_n_u32(0x3d2b9f17)); + const auto c5 = vreinterpretq_f32_u32(vdupq_n_u32(0x3c072010)); + + const auto shift = vreinterpretq_f32_u32(vdupq_n_u32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f + const auto one = vdupq_n_f32(1.0f); // 1 + const auto two = vdupq_n_f32(2.0f); // 2 + const auto inv_ln2 = vreinterpretq_f32_u32(vdupq_n_u32(0x3fb8aa3b)); + const auto neg_ln2_hi = 
vreinterpretq_f32_u32(vdupq_n_u32(0xbf317200)); + const auto neg_ln2_lo = vreinterpretq_f32_u32(vdupq_n_u32(0xb5bfbe8e)); + + const auto inf = vdupq_n_f32(std::numeric_limits::infinity()); + const auto max_input = vdupq_n_f32(88.37f); // Approximately ln(2^127.5) + const auto zero = vdupq_n_f32(0.f); + const auto min_input = vdupq_n_f32(-86.64f); // Approximately ln(2^-125) + + const auto z = vmlaq_f32(shift, src, inv_ln2); + auto n = z - shift; + n = vsubq_f32(n, one); + const auto scale = vreinterpretq_f32_u32(vreinterpretq_u32_f32(z) << 23); // 2^n + + const auto r_hi = vfmaq_f32(src, n, neg_ln2_hi); + const auto r = vfmaq_f32(r_hi, n, neg_ln2_lo); + + const auto r2 = r * r; + + const auto p1 = c1 * r; + const auto p23 = vfmaq_f32(c2, c3, r); + const auto p45 = vfmaq_f32(c4, c5, r); + const auto p2345 = vfmaq_f32(p23, p45, r2); + const auto p12345 = vfmaq_f32(p1, p2345, r2); + + auto poly = vfmaq_f32(scale, p12345, scale); + poly = vmulq_f32(poly, two); + + poly = vbslq_f32(vcltq_f32(src, min_input), zero, poly); + poly = vbslq_f32(vcgtq_f32(src, max_input), inf, poly); + + return poly; +} +inline float32x4_t __vld1q_f32(const ov::bfloat16* a) { + uint16x4_t vec_bf16 = vld1_u16(reinterpret_cast(a)); + + float32x4_t vec_f32 = vcvtq_f32_u32(vmovl_u16(vec_bf16)); + return vec_f32; +} +inline float32x4_t __vld1q_f32(const float* a) { + return vld1q_f32(a); +} +inline float32x4_t __vld1q_f32(const ov::float16* a) { + auto _a = reinterpret_cast(a); + return vcvt_f32_f16(vld1_f16(_a)); +} +inline void __vst1q_f32(float* a, float32x4_t b) { + vst1q_f32(a, b); +} +inline void __vst1q_f32(ov::float16* a, float32x4_t b) { + float16x4_t v_f16 = vcvt_f16_f32(b); + vst1_f16(reinterpret_cast(a), v_f16); +} +inline void __vst1q_f32(ov::bfloat16* a, float32x4_t b) { + uint32x4_t v_int32 = vreinterpretq_u32_f32(b); + uint16x4_t v_bf16 = vshrn_n_u32(v_int32, 16); + + vst1_u16(reinterpret_cast(a), v_bf16); +} #endif #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - inline 
float16x8_t exp_ps_neon_f16(float16x8_t x) { - const float32x4_t x_high = vcvt_f32_f16(vget_high_f16(x)); - const float32x4_t x_low = vcvt_f32_f16(vget_low_f16(x)); - - // We use f32 to maintain accuracy - const float16x8_t res = vcombine_f16(vcvt_f16_f32(exp_ps_neon_f32(x_low)), vcvt_f16_f32(exp_ps_neon_f32(x_high))); - return res; - } - inline float16_t hsum(float16x8_t vec) { - float16x4_t sum1 = vpadd_f16(vget_low_f16(vec), vget_high_f16(vec)); - float16x4_t sum2 = vpadd_f16(sum1, sum1); - float16x4_t sum3 = vpadd_f16(sum2, sum2); - return vget_lane_f16(sum3, 0); - } +inline float16x8_t exp_ps_neon_f16(float16x8_t x) { + const float32x4_t x_high = vcvt_f32_f16(vget_high_f16(x)); + const float32x4_t x_low = vcvt_f32_f16(vget_low_f16(x)); + + // We use f32 to maintain accuracy + const float16x8_t res = vcombine_f16(vcvt_f16_f32(exp_ps_neon_f32(x_low)), vcvt_f16_f32(exp_ps_neon_f32(x_high))); + return res; +} +inline float16_t hsum(float16x8_t vec) { + float16x4_t sum1 = vpadd_f16(vget_low_f16(vec), vget_high_f16(vec)); + float16x4_t sum2 = vpadd_f16(sum1, sum1); + float16x4_t sum3 = vpadd_f16(sum2, sum2); + return vget_lane_f16(sum3, 0); +} #endif } // namespace XARCH } // namespace Cpu diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp index bef34881ca41bc..a74021d8ac0d05 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp @@ -13,19 +13,19 @@ # include #endif -#include "openvino/core/type/bfloat16.hpp" -#include "openvino/core/type/float16.hpp" -#include "openvino/core/parallel.hpp" +#include "attn_memcpy.hpp" +#include "attn_quant.hpp" +#include "attn_quant_kernel.hpp" +#include "common.hpp" #include "executor_pa.hpp" #include "executor_pa_common.hpp" -#include "common.hpp" -#include "attn_quant_kernel.hpp" +#include "nodes/kernels/x64/brgemm_kernel.hpp" 
+#include "openvino/core/parallel.hpp" +#include "openvino/core/type/bfloat16.hpp" +#include "openvino/core/type/float16.hpp" #include "softmax_kernel.hpp" #include "transpose_kernel.hpp" #include "utils/plain_tensor.hpp" -#include "attn_memcpy.hpp" -#include "attn_quant.hpp" -#include "nodes/kernels/x64/brgemm_kernel.hpp" namespace ov { namespace Extensions { @@ -38,42 +38,43 @@ using namespace ov::intel_cpu; // currently depends on brgemm which only support x64 #ifdef OPENVINO_ARCH_X86_64 -#if defined(HAVE_AVX2) || defined(HAVE_AVX512F) +# if defined(HAVE_AVX2) || defined(HAVE_AVX512F) -#define prefetch_bytes(bytes, sel, advance, src) { \ - auto *p = reinterpret_cast(src); \ - for (size_t i = 0; i < bytes; i += 64) \ - _mm_prefetch(p + i + advance, sel); \ -} +# define prefetch_bytes(bytes, sel, advance, src) \ + { \ + auto* p = reinterpret_cast(src); \ + for (size_t i = 0; i < bytes; i += 64) \ + _mm_prefetch(p + i + advance, sel); \ + } -#else +# else -#define prefetch_bytes(bytes, sel, advance, src) +# define prefetch_bytes(bytes, sel, advance, src) -#endif +# endif -template +template void cvt_copy(TA* dst, TB* src, size_t n) { size_t i = 0; -#if defined(HAVE_AVX512F) +# if defined(HAVE_AVX512F) for (; i + vec_len_f32_avx512 <= n; i += vec_len_f32_avx512) { auto vb = mm512_uni_loadu_ps(src + i); mm512_uni_storeu_ps(dst + i, vb); } -#elif defined(HAVE_AVX2) +# elif defined(HAVE_AVX2) for (; i + vec_len_f32_avx2 <= n; i += vec_len_f32_avx2) { auto vb = mm256_uni_loadu_ps(src + i); mm256_uni_storeu_ps(dst + i, vb); } -#endif +# endif for (; i < n; i++) { dst[i] = src[i]; } } -template +template static void attn_acc_value_block(float* out, float* weight, T* v, size_t S, size_t block_size) { -#if defined(HAVE_AVX512F) +# if defined(HAVE_AVX512F) size_t j = 0; for (; j + 4 <= block_size; j += 4) { auto attn_w_vec0 = _mm512_set1_ps(weight[0]); @@ -132,7 +133,7 @@ static void attn_acc_value_block(float* out, float* weight, T* v, size_t S, size } } return; -#elif 
defined(HAVE_AVX2) +# elif defined(HAVE_AVX2) size_t j = 0; for (; j + 4 <= block_size; j += 4) { auto attn_w_vec0 = _mm256_set1_ps(weight[0]); @@ -191,7 +192,7 @@ static void attn_acc_value_block(float* out, float* weight, T* v, size_t S, size } } return; -#endif +# endif for (size_t j = 0; j < block_size; j++) { for (size_t i = 0; i < S; i++) { out[i] += weight[j] * v[i]; @@ -202,9 +203,9 @@ static void attn_acc_value_block(float* out, float* weight, T* v, size_t S, size static void attn_acc_value_block(float* out, float* weight, uint8_t* v, size_t S, size_t block_size) { // The layout for per token per head: - // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized feature(u8,idx_S)| - // The quantized feature will start from 8bytes=sizeof(float)+sizeof(float) -#if defined(HAVE_AVX512F) + // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized + // feature(u8,idx_S)| The quantized feature will start from 8bytes=sizeof(float)+sizeof(float) +# if defined(HAVE_AVX512F) size_t j = 0; for (; j + 4 <= block_size; j += 4) { auto v_f0 = reinterpret_cast(v); @@ -223,10 +224,18 @@ static void attn_acc_value_block(float* out, float* weight, uint8_t* v, size_t S v += 8; for (; i + vec_len_f32_avx512 <= S; i += vec_len_f32_avx512) { auto v_out = mm512_uni_loadu_ps(out + i); - auto v0 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(v + i)))), zp0); - auto v1 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(v + i + S + 8)))), zp1); - auto v2 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(v + i + 2 * (S + 8))))), zp2); - auto v3 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(v + i + 3 * (S + 8))))), zp3); + auto v0 = _mm512_sub_ps( + 
_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(v + i)))), + zp0); + auto v1 = _mm512_sub_ps( + _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(v + i + S + 8)))), + zp1); + auto v2 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32( + _mm_loadu_si128(reinterpret_cast<__m128i*>(v + i + 2 * (S + 8))))), + zp2); + auto v3 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32( + _mm_loadu_si128(reinterpret_cast<__m128i*>(v + i + 3 * (S + 8))))), + zp3); v_out = _mm512_fmadd_ps(attn_w_vec0, v0, v_out); v_out = _mm512_fmadd_ps(attn_w_vec1, v1, v_out); v_out = _mm512_fmadd_ps(attn_w_vec2, v2, v_out); @@ -251,7 +260,9 @@ static void attn_acc_value_block(float* out, float* weight, uint8_t* v, size_t S v += 8; for (; i + vec_len_f32_avx512 <= S; i += vec_len_f32_avx512) { auto v_out = mm512_uni_loadu_ps(out + i); - auto v0 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(v + i)))), zp0); + auto v0 = _mm512_sub_ps( + _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(v + i)))), + zp0); v_out = _mm512_fmadd_ps(attn_w_vec0, v0, v_out); _mm512_storeu_ps(out + i, v_out); @@ -263,7 +274,7 @@ static void attn_acc_value_block(float* out, float* weight, uint8_t* v, size_t S weight++; } return; -#elif defined(HAVE_AVX2) +# elif defined(HAVE_AVX2) size_t j = 0; for (; j < block_size; j++) { auto v_f0 = reinterpret_cast(v); @@ -273,7 +284,9 @@ static void attn_acc_value_block(float* out, float* weight, uint8_t* v, size_t S v += 8; for (; i + vec_len_f32_avx2 <= S; i += vec_len_f32_avx2) { auto v_out = mm256_uni_loadu_ps(out + i); - auto v0 = _mm256_sub_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<__m128i*>(v + i)))), zp0); + auto v0 = _mm256_sub_ps( + _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<__m128i*>(v + i)))), + zp0); v_out = _mm256_fmadd_ps(attn_w_vec0, v0, v_out); 
mm256_uni_storeu_ps(out + i, v_out); @@ -285,7 +298,7 @@ static void attn_acc_value_block(float* out, float* weight, uint8_t* v, size_t S weight++; } return; -#endif +# endif for (size_t j = 0; j < block_size; j++) { auto v0 = reinterpret_cast(v); v += 8; @@ -296,9 +309,9 @@ static void attn_acc_value_block(float* out, float* weight, uint8_t* v, size_t S } } -template +template static void dot_product_block(TA* a, TB* b, float* c, size_t n, size_t block_size) { -#if defined(HAVE_AVX512F) +# if defined(HAVE_AVX512F) size_t j = 0; for (; j + 4 <= block_size; j += 4) { auto vsum0 = _mm512_setzero_ps(); @@ -328,7 +341,7 @@ static void dot_product_block(TA* a, TB* b, float* c, size_t n, size_t block_siz c[2] = sum2; c[3] = sum3; c += 4; - b += 4 * n; + b += 4 * n; } for (; j < block_size; j++) { auto vsum = _mm512_setzero_ps(); @@ -345,7 +358,7 @@ static void dot_product_block(TA* a, TB* b, float* c, size_t n, size_t block_siz *c++ = sum; } return; -#elif defined(HAVE_AVX2) +# elif defined(HAVE_AVX2) size_t j = 0; for (; j + 4 <= block_size; j += 4) { auto vsum0 = _mm256_set1_ps(0.0f); @@ -379,7 +392,7 @@ static void dot_product_block(TA* a, TB* b, float* c, size_t n, size_t block_siz c[2] = sum2; c[3] = sum3; c += 4; - b += 4 * n; + b += 4 * n; } for (; j < block_size; j++) { auto vsum = _mm256_set1_ps(0.0f); @@ -397,7 +410,7 @@ static void dot_product_block(TA* a, TB* b, float* c, size_t n, size_t block_siz *c++ = sum; } return; -#endif +# endif for (size_t j = 0; j < block_size; j++) { float sum = 0; for (size_t i = 0; i < n; i++) { @@ -408,12 +421,12 @@ static void dot_product_block(TA* a, TB* b, float* c, size_t n, size_t block_siz } } -template +template static void dot_product_block(TA* a, uint8_t* b, float* c, size_t n, size_t block_size) { // The layout for per token per head: - // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized feature(u8,idx_S)| - // The quantized feature will start from 
8bytes=sizeof(float)+sizeof(float) -#if defined(HAVE_AVX512F) + // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized + // feature(u8,idx_S)| The quantized feature will start from 8bytes=sizeof(float)+sizeof(float) +# if defined(HAVE_AVX512F) size_t j = 0; for (; j + 4 <= block_size; j += 4) { auto vsum0 = _mm512_setzero_ps(); @@ -432,10 +445,18 @@ static void dot_product_block(TA* a, uint8_t* b, float* c, size_t n, size_t bloc b += 8; for (; i + vec_len_f32_avx512 <= n; i += vec_len_f32_avx512) { auto va = mm512_uni_loadu_ps(a + i); - auto vb0 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(b + i)))), v_zp0); - auto vb1 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(b + i + n + 8)))), v_zp1); - auto vb2 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(b + i + 2 * (n + 8))))), v_zp2); - auto vb3 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(b + i + 3 * (n + 8))))), v_zp3); + auto vb0 = _mm512_sub_ps( + _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(b + i)))), + v_zp0); + auto vb1 = _mm512_sub_ps( + _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(b + i + n + 8)))), + v_zp1); + auto vb2 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32( + _mm_loadu_si128(reinterpret_cast<__m128i*>(b + i + 2 * (n + 8))))), + v_zp2); + auto vb3 = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32( + _mm_loadu_si128(reinterpret_cast<__m128i*>(b + i + 3 * (n + 8))))), + v_zp3); vsum0 = _mm512_fmadd_ps(va, vb0, vsum0); vsum1 = _mm512_fmadd_ps(va, vb1, vsum1); @@ -457,7 +478,7 @@ static void dot_product_block(TA* a, uint8_t* b, float* c, size_t n, size_t bloc c[2] = sum2 * b2[0]; c[3] = sum3 * b3[0]; c += 4; - b += 4 * (n + 8) - 8; + b += 4 * (n + 8) - 8; } for (; 
j < block_size; j++) { auto vsum = _mm512_setzero_ps(); @@ -467,7 +488,9 @@ static void dot_product_block(TA* a, uint8_t* b, float* c, size_t n, size_t bloc b += 8; for (; i + vec_len_f32_avx512 <= n; i += vec_len_f32_avx512) { auto va = mm512_uni_loadu_ps(a + i); - auto vb = _mm512_sub_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(b + i)))), v_zp); + auto vb = _mm512_sub_ps( + _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<__m128i*>(b + i)))), + v_zp); vsum = _mm512_fmadd_ps(va, vb, vsum); } float sum = _mm512_reduce_add_ps(vsum); @@ -478,7 +501,7 @@ static void dot_product_block(TA* a, uint8_t* b, float* c, size_t n, size_t bloc *c++ = sum * b0[0]; } return; -#elif defined(HAVE_AVX2) +# elif defined(HAVE_AVX2) size_t j = 0; for (; j + 4 <= block_size; j += 4) { auto vsum0 = _mm256_setzero_ps(); @@ -497,10 +520,18 @@ static void dot_product_block(TA* a, uint8_t* b, float* c, size_t n, size_t bloc b += 8; for (; i + vec_len_f32_avx2 <= n; i += vec_len_f32_avx2) { auto va = mm256_uni_loadu_ps(a + i); - auto vb0 = _mm256_sub_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<__m128i*>(b + i)))), v_zp0); - auto vb1 = _mm256_sub_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<__m128i*>(b + i + n + 8)))), v_zp1); - auto vb2 = _mm256_sub_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<__m128i*>(b + i + 2 * (n + 8))))), v_zp2); - auto vb3 = _mm256_sub_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<__m128i*>(b + i + 3 * (n + 8))))), v_zp3); + auto vb0 = _mm256_sub_ps( + _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<__m128i*>(b + i)))), + v_zp0); + auto vb1 = _mm256_sub_ps( + _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<__m128i*>(b + i + n + 8)))), + v_zp1); + auto vb2 = _mm256_sub_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32( + 
_mm_loadl_epi64(reinterpret_cast<__m128i*>(b + i + 2 * (n + 8))))), + v_zp2); + auto vb3 = _mm256_sub_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32( + _mm_loadl_epi64(reinterpret_cast<__m128i*>(b + i + 3 * (n + 8))))), + v_zp3); vsum0 = _mm256_fmadd_ps(va, vb0, vsum0); vsum1 = _mm256_fmadd_ps(va, vb1, vsum1); @@ -526,7 +557,7 @@ static void dot_product_block(TA* a, uint8_t* b, float* c, size_t n, size_t bloc c[2] = sum2 * b2[0]; c[3] = sum3 * b3[0]; c += 4; - b += 4 * (n + 8) - 8; + b += 4 * (n + 8) - 8; } for (; j < block_size; j++) { auto vsum = _mm256_setzero_ps(); @@ -536,7 +567,9 @@ static void dot_product_block(TA* a, uint8_t* b, float* c, size_t n, size_t bloc b += 8; for (; i + vec_len_f32_avx2 <= n; i += vec_len_f32_avx2) { auto va = mm256_uni_loadu_ps(a + i); - auto vb = _mm256_sub_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<__m128i*>(b + i)))), v_zp); + auto vb = _mm256_sub_ps( + _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<__m128i*>(b + i)))), + v_zp); vsum = _mm256_fmadd_ps(va, vb, vsum); } hsum(vsum); @@ -548,7 +581,7 @@ static void dot_product_block(TA* a, uint8_t* b, float* c, size_t n, size_t bloc *c++ = sum * b0[0]; } return; -#endif +# endif for (size_t j = 0; j < block_size; j++) { float sum = 0; auto b0 = reinterpret_cast(b); @@ -561,11 +594,11 @@ static void dot_product_block(TA* a, uint8_t* b, float* c, size_t n, size_t bloc } } -template +template static void attn_reduce(T* dst, float* temp, size_t M, size_t S, size_t temp_stride) { size_t i = 0; -#if defined(HAVE_AVX512F) - for (; i + vec_len_f32_avx512 <= S; i+= vec_len_f32_avx512) { +# if defined(HAVE_AVX512F) + for (; i + vec_len_f32_avx512 <= S; i += vec_len_f32_avx512) { auto* src = temp + i; auto result_vec_fp32 = _mm512_setzero_ps(); for (size_t m = 0; m < M; m++) { @@ -576,7 +609,7 @@ static void attn_reduce(T* dst, float* temp, size_t M, size_t S, size_t temp_str // save to bf16 mm512_uni_storeu_ps(dst + i, result_vec_fp32); 
} -#elif defined(HAVE_AVX2) +# elif defined(HAVE_AVX2) for (; i + vec_len_f32_avx2 <= S; i += vec_len_f32_avx2) { auto* src = temp + i; auto result_vec_fp32 = _mm256_set1_ps(0.0f); @@ -587,7 +620,7 @@ static void attn_reduce(T* dst, float* temp, size_t M, size_t S, size_t temp_str } mm256_uni_storeu_ps(dst + i, result_vec_fp32); } -#endif +# endif for (; i < S; i++) { auto* src = temp + i; float sum = 0.0f; @@ -601,7 +634,7 @@ static void attn_reduce(T* dst, float* temp, size_t M, size_t S, size_t temp_str } // N must be multiple of 16 -template +template void transpose_16NxK(TDST* dst, TSRC* src, TDST* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { size_t k = 0; for (; k + 16 <= K; k += 16) { @@ -619,24 +652,26 @@ void transpose_16NxK(TDST* dst, TSRC* src, TDST* tmp, size_t N, size_t K, size_t } } -#if defined(HAVE_AVX512F) -template::value || std::is_same::value), bool>::type> +# if defined(HAVE_AVX512F) +template ::value || std::is_same::value), bool>::type> static void transpose_16NxK(T* dst, T* src, T* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { // will treat as uint32_t transpose auto s = reinterpret_cast(src); auto d = reinterpret_cast(dst); transpose_16NxK(d, s, reinterpret_cast(0), N, K >> 1, dst_stride, src_stride >> 1); } -#endif +# endif -template +template void transpose_16NxK(TDST* dst, uint8_t* src, TDST* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { // The layout for per token per head: - // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized feature(u8,idx_S)| - // The quantized feature will start from 8bytes=sizeof(float)+sizeof(float) + // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized + // feature(u8,idx_S)| The quantized feature will start from 8bytes=sizeof(float)+sizeof(float) auto s = src; auto t = tmp; - for (size_t n = 0; n < N; n ++) { + for (size_t n = 0; n < N; n++) { auto f = 
reinterpret_cast(s); attn_dequant_u8_kernel(s + 2 * sizeof(float), t, K, f[0], f[1]); s += src_stride + 2 * sizeof(float); @@ -646,7 +681,7 @@ void transpose_16NxK(TDST* dst, uint8_t* src, TDST* tmp, size_t N, size_t K, siz } // dequant f16/u8 to float -template +template static inline void dequant(T* dst, T* src, size_t N, size_t K) { // never called OPENVINO_THROW("dequant: should not be called."); @@ -656,13 +691,13 @@ static inline void dequant(float* dst, ov::float16* src, size_t N, size_t K) { cvt_copy(dst, src, K * N); } -template +template void dequant(TDST* dst, uint8_t* src, size_t N, size_t K) { // The layout for per token per head: - // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized feature(u8,idx_S)| - // The quantized feature will start from 8bytes=sizeof(float)+sizeof(float) + // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized + // feature(u8,idx_S)| The quantized feature will start from 8bytes=sizeof(float)+sizeof(float) auto s = src; - for (size_t n = 0; n < N; n ++) { + for (size_t n = 0; n < N; n++) { auto f = reinterpret_cast(s); attn_dequant_u8_kernel(s + 2 * sizeof(float), dst, K, f[0], f[1]); s += K + 2 * sizeof(float); @@ -670,18 +705,24 @@ void dequant(TDST* dst, uint8_t* src, size_t N, size_t K) { } } -#if defined(HAVE_AVX512F) -template::value || std::is_same::value), bool>::type> +# if defined(HAVE_AVX512F) +template ::value || std::is_same::value), bool>::type> static void pack_32x32_kernel(T* dst, T* src, size_t dst_stride, size_t src_stride) { static const uint64_t idx[8] = {0, 4, 1, 5, 2, 6, 3, 7}; auto midx = _mm512_loadu_si512(idx); for (size_t i = 0; i < 16; i++) { - auto a = _mm512_loadu_si512(src); // [a1 a2 a3 a4 | a5 a6 a7 a8] total 512-bits in 8 64bits unit + auto a = _mm512_loadu_si512(src); // [a1 a2 a3 a4 | a5 a6 a7 a8] total 512-bits in 8 64bits unit auto b = _mm512_loadu_si512(src + src_stride); // [b1 b2 b3 b4 | b5 b6 b7 
b8] total 512-bits a = _mm512_permutexvar_epi64(midx, a); // [a1 a5 | a2 a6 | a3 a7 | a4 a8] b = _mm512_permutexvar_epi64(midx, b); // [b1 b5 | b2 b6 | b3 b7 | b4 b8] - auto B0 = _mm512_unpacklo_epi16(a, b); // [ a1&b1 a2&b2 a3&b3 a4&b4] for each 128-bits lane, interleave word in low 64 bits - auto B1 = _mm512_unpackhi_epi16(a, b); // [ a5&b5 a6&b6 a7&b7 a8&b8] for each 128-bits lane, interleave word in high 64 bits + auto B0 = _mm512_unpacklo_epi16( + a, + b); // [ a1&b1 a2&b2 a3&b3 a4&b4] for each 128-bits lane, interleave word in low 64 bits + auto B1 = _mm512_unpackhi_epi16( + a, + b); // [ a5&b5 a6&b6 a7&b7 a8&b8] for each 128-bits lane, interleave word in high 64 bits _mm512_storeu_si512(dst, B0); _mm512_storeu_si512(dst + 32, B1); src += 2 * src_stride; @@ -689,17 +730,20 @@ static void pack_32x32_kernel(T* dst, T* src, size_t dst_stride, size_t src_stri } } -template::value || std::is_same::value), bool>::type> +template ::value || std::is_same::value), bool>::type> static void pack_32x16_kernel(T* dst, T* src, size_t dst_stride, size_t src_stride) { static const uint64_t idx[8] = {0, 4, 1, 5, 2, 6, 3, 7}; auto midx = _mm512_loadu_si512(idx); for (size_t i = 0; i < 16; i++) { - auto x = _mm256_loadu_si256(reinterpret_cast<__m256i*>(src)); // [a1 a2 a3 a4] total 256-bits in 4 64bits unit + auto x = + _mm256_loadu_si256(reinterpret_cast<__m256i*>(src)); // [a1 a2 a3 a4] total 256-bits in 4 64bits unit auto y = _mm256_loadu_si256(reinterpret_cast<__m256i*>(src + src_stride)); // [b1 b2 b3 b4] total 256-bits auto a = _mm512_castsi256_si512(x); auto b = _mm512_castsi256_si512(y); - a = _mm512_permutexvar_epi64(midx, a); // [a1 x | a2 x | a3 x | a4 x] - b = _mm512_permutexvar_epi64(midx, b); // [b1 x | b2 x | b3 x | b4 x] + a = _mm512_permutexvar_epi64(midx, a); // [a1 x | a2 x | a3 x | a4 x] + b = _mm512_permutexvar_epi64(midx, b); // [b1 x | b2 x | b3 x | b4 x] auto B0 = _mm512_unpacklo_epi16(a, b); _mm512_storeu_si512(dst, B0); src += 2 * src_stride; @@ 
-707,18 +751,20 @@ static void pack_32x16_kernel(T* dst, T* src, size_t dst_stride, size_t src_stri } } -template::value || std::is_same::value), bool>::type> +template ::value || std::is_same::value), bool>::type> static void pack_32xK_kernel(T* dst, T* src, size_t dst_stride, size_t src_stride, size_t K) { static const uint64_t idx[8] = {0, 4, 1, 5, 2, 6, 3, 7}; auto midx = _mm512_loadu_si512(idx); __mmask16 mask = (1 << K) - 1; for (size_t i = 0; i < K; i++) { - auto x = _mm256_maskz_loadu_epi16(mask, src); // [a1 a2 a3 a4] total 256-bits in 4 64bits unit - auto y = _mm256_maskz_loadu_epi16(mask, src + src_stride); // [b1 b2 b3 b4] total 256-bits + auto x = _mm256_maskz_loadu_epi16(mask, src); // [a1 a2 a3 a4] total 256-bits in 4 64bits unit + auto y = _mm256_maskz_loadu_epi16(mask, src + src_stride); // [b1 b2 b3 b4] total 256-bits auto a = _mm512_castsi256_si512(x); auto b = _mm512_castsi256_si512(y); - a = _mm512_permutexvar_epi64(midx, a); // [a1 x | a2 x | a3 x | a4 x] - b = _mm512_permutexvar_epi64(midx, b); // [b1 x | b2 x | b3 x | b4 x] + a = _mm512_permutexvar_epi64(midx, a); // [a1 x | a2 x | a3 x | a4 x] + b = _mm512_permutexvar_epi64(midx, b); // [b1 x | b2 x | b3 x | b4 x] auto B0 = _mm512_unpacklo_epi16(a, b); _mm512_mask_storeu_epi32(dst, mask, B0); src += 2 * src_stride; @@ -726,7 +772,9 @@ static void pack_32xK_kernel(T* dst, T* src, size_t dst_stride, size_t src_strid } } -template::value || std::is_same::value), bool>::type> +template ::value || std::is_same::value), bool>::type> static void pack_32NxK(T* dst, T* src, T* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { for (size_t n = 0; n < N; n += 32) { size_t k = 0; @@ -746,14 +794,16 @@ static void pack_32NxK(T* dst, T* src, T* tmp, size_t N, size_t K, size_t dst_st } } -template::value || std::is_same::value), bool>::type> +template ::value || std::is_same::value), bool>::type> static void pack_32NxK(T* dst, uint8_t* src, T* tmp, size_t N, size_t K, size_t dst_stride, 
size_t src_stride) { // The layout for per token per head: - // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized feature(u8,idx_S)| - // The quantized feature will start from 8bytes=sizeof(float)+sizeof(float) + // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized + // feature(u8,idx_S)| The quantized feature will start from 8bytes=sizeof(float)+sizeof(float) auto s = src; auto t = tmp; - for (size_t n = 0; n < N; n ++) { + for (size_t n = 0; n < N; n++) { auto f = reinterpret_cast(s); attn_dequant_u8_kernel(s + 2 * sizeof(float), t, K, f[0], f[1]); s += src_stride + 2 * sizeof(float); @@ -761,9 +811,9 @@ static void pack_32NxK(T* dst, uint8_t* src, T* tmp, size_t N, size_t K, size_t } pack_32NxK(dst, tmp, reinterpret_cast(0), N, K, dst_stride, src_stride); } -#endif +# endif -template +template static void pack_32NxK(float* dst, T* src, float* tmp, size_t N, size_t K, size_t dst_stride, size_t src_stride) { // never called OPENVINO_THROW("pack_32NxK: should not be called."); @@ -782,10 +832,10 @@ struct MHAHelper { size_t _sliding_window; float _d_scale; - PlainTensor _weight; // [nthr, H, 32, rnd_up(kv_len, block_size)], shared by first and second loop along bh - PlainTensor _output; // [nthr, 32, H, S], shared by first and second loop along bh - PlainTensor _qk_scratch_a; // [nthr, scratch_a_size] - PlainTensor _qk_scratch_b; // [B, rnd_up(kv_len, block_size), Hk, scratch_b_size] + PlainTensor _weight; // [nthr, H, 32, rnd_up(kv_len, block_size)], shared by first and second loop along bh + PlainTensor _output; // [nthr, 32, H, S], shared by first and second loop along bh + PlainTensor _qk_scratch_a; // [nthr, scratch_a_size] + PlainTensor _qk_scratch_b; // [B, rnd_up(kv_len, block_size), Hk, scratch_b_size] PlainTensor _wv_scratch_a; PlainTensor _wv_scratch_b; PlainTensor _alibi_lookup; @@ -810,8 +860,16 @@ struct MHAHelper { _weight.resize({size_t{1}, size_t{1}, 
size_t{1}, size_t{1}}); } - void init(size_t H, size_t S, size_t SV, size_t Hk, size_t h_each_group_len, size_t block_size, size_t sliding_window, - float d_scale, size_t kv_len, bool init_alibi_lookup) { + void init(size_t H, + size_t S, + size_t SV, + size_t Hk, + size_t h_each_group_len, + size_t block_size, + size_t sliding_window, + float d_scale, + size_t kv_len, + bool init_alibi_lookup) { // query shape: [B, H, L, S] // present_key shape: [block, H, 32, S] // Q*K': [M1, S] * [M2, S]' @@ -853,25 +911,27 @@ struct MHAHelper { _weight.stride(2), false, in_type); - _wv_gemm[i] = std::make_shared(i + 1, - _SV, - _block_size, - // if it's bf16, the stride needs double due to reuse float buffer - (in_type == ov::element::Type_t::f32 ? 1 : 2) * _weight.stride(2), - _SV, - _output.stride(1), - false, - in_type); - _wv_gemm_acc[i] = std::make_shared(i + 1, - _SV, - _block_size, - // if it's bf16, the stride needs double due to reuse float buffer - (in_type == ov::element::Type_t::f32 ? 1 : 2) * _weight.stride(2), - _SV, - _output.stride(1), - false, - in_type, - true); + _wv_gemm[i] = + std::make_shared(i + 1, + _SV, + _block_size, + // if it's bf16, the stride needs double due to reuse float buffer + (in_type == ov::element::Type_t::f32 ? 1 : 2) * _weight.stride(2), + _SV, + _output.stride(1), + false, + in_type); + _wv_gemm_acc[i] = + std::make_shared(i + 1, + _SV, + _block_size, + // if it's bf16, the stride needs double due to reuse float buffer + (in_type == ov::element::Type_t::f32 ? 
1 : 2) * _weight.stride(2), + _SV, + _output.stride(1), + false, + in_type, + true); } // wsp is used to compute beta when K is blocked @@ -879,8 +939,10 @@ struct MHAHelper { _wsp.resize(_nthr * _wsp_size_per_thread); // allocate scratch a/b, notice get_scratch_a_size/get_scratch_b_size returns in bytes - _qk_scratch_a.resize({_nthr, _qk_gemm[_block_size - 1]->get_scratch_a_size() / sizeof(DATA_TYPE)}); - _wv_scratch_a.resize({_nthr, _wv_gemm[_block_size - 1]->get_scratch_a_size() / sizeof(DATA_TYPE)}); + _qk_scratch_a.resize( + {_nthr, _qk_gemm[_block_size - 1]->get_scratch_a_size() / sizeof(DATA_TYPE)}); + _wv_scratch_a.resize( + {_nthr, _wv_gemm[_block_size - 1]->get_scratch_a_size() / sizeof(DATA_TYPE)}); if ((S % 32 == 0) && (block_size % 16 == 0) && (S <= 32 * 6)) { if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::amx_bf16) && @@ -894,14 +956,16 @@ struct MHAHelper { } } if (one_of(_fastpath_valid_prec, ov::element::bf16, ov::element::f16) && !_gemv) { - _gemv = std::make_shared(static_cast(S), static_cast(block_size), _fastpath_valid_prec); + _gemv = std::make_shared(static_cast(S), + static_cast(block_size), + _fastpath_valid_prec); } } if (init_alibi_lookup && (!_alibi_lookup || _alibi_lookup.m_dims[0] < kv_len)) { _alibi_lookup.resize({kv_len * 2}); for (size_t i = 0; i < _alibi_lookup.m_dims[0]; i++) - _alibi_lookup.ptr()[i] = - static_cast((_alibi_lookup.m_dims[0] - 1 - i)); + _alibi_lookup.ptr()[i] = -static_cast((_alibi_lookup.m_dims[0] - 1 - i)); } } @@ -937,16 +1001,28 @@ struct MHAHelper { // output_emb: [L, H * S] // qk_scratch_b: [rnd_up(kv_len, block_size), Hk, scratch_b_size] // wv_scratch_b: [rnd_up(kv_len, block_size), Hk, scratch_b_size] - void exec_kernel_multiple(const PlainTensor& query, const PlainTensor& present_value, const PlainTensor& output_emb, - const PlainTensor& qk_scratch_b, const PlainTensor& wv_scratch_b, const int32_t* block_table, size_t ithr, size_t q_blk, - size_t hk, size_t q_len, size_t cur_kv_len, const 
PlainTensor& alibi_slopes, float* score_output) { + void exec_kernel_multiple(const PlainTensor& query, + const PlainTensor& present_value, + const PlainTensor& output_emb, + const PlainTensor& qk_scratch_b, + const PlainTensor& wv_scratch_b, + const int32_t* block_table, + size_t ithr, + size_t q_blk, + size_t hq_beg, + size_t hq_end, + size_t hk, + size_t q_len, + size_t cur_kv_len, + const PlainTensor& alibi_slopes, + float* score_output) { auto q_start = q_blk * _block_size; auto q_end = std::min(q_start + _block_size, q_len); auto q_cnt = q_end - q_start; constexpr bool q_is_xf16 = one_of(precision_of::value, ov::element::bf16, ov::element::f16); constexpr bool q_cache_is_same = precision_of::value == precision_of::value; auto cur_kv_len_blocks = div_up(cur_kv_len, _block_size); - for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { + for (size_t h = hq_beg; h < hq_end; h++) { auto* q_ptr = query.ptr(h, q_start, 0); float* c_ptr = _weight.ptr(ithr, h, 0, 0); // for each query block, loop through all key block @@ -1012,13 +1088,16 @@ struct MHAHelper { alibi_slope); } if (score_output) { - cvt_copy(score_output + h * rnd_up(cur_kv_len, 16), reinterpret_cast(score), cur_kv_len); + cvt_copy(score_output + h * rnd_up(cur_kv_len, 16), + reinterpret_cast(score), + cur_kv_len); } } // reuse float buffer, need to use float to compute offset auto* w_ptr = reinterpret_cast(_weight.ptr(ithr, h, 0, 0)); - float* fp32_out_ptr = q_is_xf16 ? _output.ptr(ithr, 0, h, 0) : output_emb.ptr(q_start, h * _SV); + float* fp32_out_ptr = + q_is_xf16 ? _output.ptr(ithr, 0, h, 0) : output_emb.ptr(q_start, h * _SV); // for each weight block, loop through all value block for (size_t v_blk = 0; v_blk < cur_kv_len_blocks; v_blk++) { @@ -1036,12 +1115,13 @@ struct MHAHelper { _wsp.data() + ithr * _wsp_size_per_thread, _wv_scratch_a ? 
_wv_scratch_a.ptr(ithr, 0) : nullptr); } else { - _wv_gemm_acc[q_cnt - 1]->executeGemm(q_cnt < _block_size, - w_ptr + v_blk * _block_size, - v_ptr, - fp32_out_ptr, - _wsp.data() + ithr * _wsp_size_per_thread, - _wv_scratch_a ? _wv_scratch_a.ptr(ithr, 0) : nullptr); + _wv_gemm_acc[q_cnt - 1]->executeGemm( + q_cnt < _block_size, + w_ptr + v_blk * _block_size, + v_ptr, + fp32_out_ptr, + _wsp.data() + ithr * _wsp_size_per_thread, + _wv_scratch_a ? _wv_scratch_a.ptr(ithr, 0) : nullptr); } } if (q_is_xf16) { @@ -1064,16 +1144,28 @@ struct MHAHelper { // output_emb: [L, H * S] // weight: [nthr, H, 32, rnd_up(kv_len, block_size)] // output: [nthr, 32, H, S] - void exec_kernel_one_bh(const PlainTensor& query, const PlainTensor& present_key, const PlainTensor& present_value, const PlainTensor& output_emb, - const int32_t* block_table, size_t ithr, size_t hk, size_t q_len, size_t cur_kv_len, const PlainTensor& alibi_slopes, float* score_output) { + void exec_kernel_one_bh(const PlainTensor& query, + const PlainTensor& present_key, + const PlainTensor& present_value, + const PlainTensor& output_emb, + const int32_t* block_table, + size_t ithr, + size_t hq_beg, + size_t hq_end, + size_t hk, + size_t q_len, + size_t cur_kv_len, + const PlainTensor& alibi_slopes, + float* score_output) { if (one_of(_fastpath_valid_prec, ov::element::bf16, ov::element::f16)) { _gemv->tile_config(); for (size_t pk = 0, i = 0; pk < cur_kv_len; pk += _block_size, i++) { auto block_number = block_table[i]; for (size_t pq = 0; pq < q_len; pq++) { - for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { - (*_gemv)(query.ptr(h, pq), present_key.ptr(block_number, hk), - _weight.ptr(ithr, h, pq) + pk); + for (size_t h = hq_beg; h < hq_end; h++) { + (*_gemv)(query.ptr(h, pq), + present_key.ptr(block_number, hk), + _weight.ptr(ithr, h, pq) + pk); } } } @@ -1082,16 +1174,19 @@ struct MHAHelper { for (size_t pk = 0, i = 0; pk < cur_kv_len; pk += _block_size, i++) { auto block_number = 
block_table[i]; for (size_t pq = 0; pq < q_len; pq++) { - for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { - dot_product_block(query.ptr(h, pq), present_key.ptr(block_number, hk), - _weight.ptr(ithr, h, pq) + pk, _S, std::min(_block_size, cur_kv_len - pk)); + for (size_t h = hq_beg; h < hq_end; h++) { + dot_product_block(query.ptr(h, pq), + present_key.ptr(block_number, hk), + _weight.ptr(ithr, h, pq) + pk, + _S, + std::min(_block_size, cur_kv_len - pk)); } } } } for (size_t pq = 0; pq < q_len; pq++) { - for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { + for (size_t h = hq_beg; h < hq_end; h++) { // apply attention mask & sofmax float* alibi_lookup = nullptr; float alibi_slope = 0.f; @@ -1112,7 +1207,9 @@ struct MHAHelper { ov::element::f32, alibi_slope); if (score_output) { - memcpy(score_output + h * rnd_up(cur_kv_len, 16), _weight.ptr(ithr, h, pq), cur_kv_len * sizeof(float)); + memcpy(score_output + h * rnd_up(cur_kv_len, 16), + _weight.ptr(ithr, h, pq), + cur_kv_len * sizeof(float)); } } } @@ -1122,7 +1219,7 @@ struct MHAHelper { auto block_number = block_table[i]; auto* v = present_value.ptr(block_number, hk); for (size_t pq = 0; pq < q_len; pq++) { - for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { + for (size_t h = hq_beg; h < hq_end; h++) { attn_acc_value_block(_output.ptr(ithr, pq, h), _weight.ptr(ithr, h, pq) + pv, v, @@ -1133,13 +1230,13 @@ struct MHAHelper { } // convert to dst for (size_t pq = 0; pq < q_len; pq++) - for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) + for (size_t h = hq_beg; h < hq_end; h++) cvt_copy(output_emb.ptr(pq, h * _SV), _output.ptr(ithr, pq, h), _SV); } - // compute one token, loop along batch, head dimensions and kv_len, it's special for very long kv_len with small batch tokens. - // It will assume NO mixture execution of first and second token. - // all tensors such as query... 
have batch dimension which is DIFFERENT from above + // compute one token, loop along batch, head dimensions and kv_len, it's special for very long kv_len with small + // batch tokens. It will assume NO mixture execution of first and second token. all tensors such as query... have + // batch dimension which is DIFFERENT from above // query: [B, H, L, S] // present_*: [block_number, H, 32, S] // output_emb: [B, L, H * S] @@ -1162,8 +1259,39 @@ struct MHAHelper { // aligned to cache line (64bytes=16*sizeof(float)) to avoid false sharing _weight_bhl.resize({B, _H, q_len, rnd_up(max_context_len, std::max(_block_size, size_t{16}))}); - parallel_for3d_dynamic(B, kv_len_in_blocks, _Hk, [&](size_t b, size_t pk_in_blocks, size_t hk) { + // for small batches dynamic scheduler has notable overhead + bool prefer_static_loop; + // if less than 2 work items per thread, loop H + bool loop_hk = B * kv_len_in_blocks * _Hk <= 2 * _nthr ? false : true; + if (B <= 32) { + prefer_static_loop = true; + // small batch and all batch size is same(like SDPA case) + auto kv_len = past_lens.ptr()[0]; + for (size_t b = 1; b < B; b++) { + if (past_lens.ptr()[b] != kv_len) + prefer_static_loop = false; + } + } else { + // for bigger batch skip the test to save the cost + prefer_static_loop = false; + } + auto get_h_params = + [](bool loop_hk, size_t hx, size_t h_each_group_len, size_t& hq_beg, size_t& hq_end, size_t& hk) { + if (loop_hk) { + hk = hx; + hq_beg = hk * h_each_group_len; + hq_end = (hk + 1) * h_each_group_len; + } else { + hq_beg = hx; + hq_end = hx + 1; + hk = hx / h_each_group_len; + } + }; + auto loop_qk = [&](size_t b, size_t pk_in_blocks, size_t hx) { auto context_len = static_cast(past_lens.ptr()[b]) + 1; + size_t hk, hq_beg, hq_end; + get_h_params(loop_hk, hx, _h_each_group_len, hq_beg, hq_end, hk); + // kv_len must be valid auto pk = pk_in_blocks * _block_size; if (pk < context_len) { @@ -1171,24 +1299,28 @@ struct MHAHelper { if (one_of(_fastpath_valid_prec, 
ov::element::bf16, ov::element::f16)) { _gemv->tile_config(); for (size_t pq = 0; pq < q_len; pq++) { - for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { - (*_gemv)(query.ptr(b, h, pq), present_key.ptr(block_number, hk), - _weight_bhl.ptr(b, h, pq) + pk); + for (size_t h = hq_beg; h < hq_end; h++) { + (*_gemv)(query.ptr(b, h, pq), + present_key.ptr(block_number, hk), + _weight_bhl.ptr(b, h, pq) + pk); } } _gemv->tile_release(); } else { for (size_t pq = 0; pq < q_len; pq++) { - for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { - dot_product_block(query.ptr(b, h, pq), present_key.ptr(block_number, hk), - _weight_bhl.ptr(b, h, pq) + pk, _S, std::min(_block_size, context_len - pk)); + for (size_t h = hq_beg; h < hq_end; h++) { + dot_product_block(query.ptr(b, h, pq), + present_key.ptr(block_number, hk), + _weight_bhl.ptr(b, h, pq) + pk, + _S, + std::min(_block_size, context_len - pk)); } } } } - }); + }; - parallel_for3d_dynamic(B, _H, q_len, [&](size_t b, size_t h, size_t pq) { + auto loop_softmax = [&](size_t b, size_t h, size_t pq) { auto cur_kv_len = static_cast(past_lens.ptr()[b]) + 1; auto ncausal = cur_kv_len; // apply attention mask & sofmax @@ -1210,7 +1342,16 @@ struct MHAHelper { ov::element::f32, ov::element::f32, alibi_slope); - }); + }; + + size_t h_dims = loop_hk ? 
_Hk : _H; + if (prefer_static_loop) { + parallel_for3d(B, kv_len_in_blocks, h_dims, loop_qk); + parallel_for3d(B, _H, q_len, loop_softmax); + } else { + parallel_for3d_dynamic(B, kv_len_in_blocks, h_dims, loop_qk); + parallel_for3d_dynamic(B, _H, q_len, loop_softmax); + } if (output_score) { parallel_for2d_dynamic(B, q_len, [&](size_t b, size_t pq) { @@ -1229,16 +1370,19 @@ struct MHAHelper { memset(_output_bhl.ptr(ithr, 0, 0, 0, 0), 0, _output_bhl.stride(0) * sizeof(float)); }); - parallel_for3d_dynamic(B, kv_len_in_blocks, _Hk, [&](size_t b, size_t pv_in_blocks, size_t hk) { + auto loop_wk = [&](size_t b, size_t pv_in_blocks, size_t hx) { auto ithr = parallel_get_thread_num(); auto context_len = static_cast(past_lens.ptr()[b]) + 1; auto pv = pv_in_blocks * _block_size; + size_t hk, hq_beg, hq_end; + get_h_params(loop_hk, hx, _h_each_group_len, hq_beg, hq_end, hk); + // kv_len must be valid if (pv < context_len) { auto block_number = block_indices.ptr()[block_indices_begins.ptr()[b] + pv_in_blocks]; auto* v = present_value.ptr(block_number, hk); for (size_t pq = 0; pq < q_len; pq++) { - for (size_t h = hk * _h_each_group_len; h < (hk + 1) * _h_each_group_len; h++) { + for (size_t h = hq_beg; h < hq_end; h++) { attn_acc_value_block(_output_bhl.ptr(ithr, b, pq, h), _weight_bhl.ptr(b, h, pq) + pv, v, @@ -1247,7 +1391,13 @@ struct MHAHelper { } } } - }); + }; + + if (prefer_static_loop) { + parallel_for3d(B, kv_len_in_blocks, loop_hk ? _Hk : _H, loop_wk); + } else { + parallel_for3d_dynamic(B, kv_len_in_blocks, loop_hk ? 
_Hk : _H, loop_wk); + } parallel_for3d(B, _H, q_len, [&](size_t b, size_t h, size_t pq) { auto* temp = _output_bhl.ptr(0, b, pq, h); @@ -1262,26 +1412,29 @@ template struct MHA { MHAHelper& _helper; struct AttnWorkItem { - int32_t batch_in_reorder; // which batch in reorder buffer will be used - int32_t batch_in_seq; // batch idx in sequence - int32_t q_len; // current sequence length, 1 for second token, 2+ for first token - int32_t q_block_id; // block id in this seq, valid at first token + int32_t batch_in_reorder; // which batch in reorder buffer will be used + int32_t batch_in_seq; // batch idx in sequence + int32_t q_len; // current sequence length, 1 for second token, 2+ for first token + int32_t q_block_id; // block id in this seq, valid at first token }; struct ReorderWorkItem { - int32_t batch_in_seq; // batch idx in sequence - int32_t batch_in_reorder; // which batch in reorder buffer will be used - int32_t kv_block_id; // block id in this kv cache seq + int32_t batch_in_seq; // batch idx in sequence + int32_t batch_in_reorder; // which batch in reorder buffer will be used + int32_t kv_block_id; // block id in this kv cache seq }; struct WorkItems { private: std::vector attn_items; std::vector reorder_items; - int32_t max_kv_len_in_reorder; // max kv len between first tokens + int32_t max_kv_len_in_reorder; // max kv len between first tokens int32_t max_batch_in_reorder; int32_t total_kv_len; public: - void reset(const PlainTensor& query, const PlainTensor& past_lens, const PlainTensor& subsequence_begins, size_t block_size) { + void reset(const PlainTensor& query, + const PlainTensor& past_lens, + const PlainTensor& subsequence_begins, + size_t block_size) { attn_items.clear(); reorder_items.clear(); max_kv_len_in_reorder = 0; @@ -1294,21 +1447,19 @@ struct MHA { auto kv_len = past_lens.ptr()[i] + q_len; auto kv_len_in_block = static_cast(div_up(kv_len, block_size)); if (q_len == 1) { - attn_items.emplace_back(AttnWorkItem{ - 0, // batch_in_reorder - i, 
// batch_in_seq - 1ull, // q_len - // kv_len in blocks, used in the sort function - kv_len_in_block - 1 - }); + attn_items.emplace_back(AttnWorkItem{0, // batch_in_reorder + i, // batch_in_seq + 1ull, // q_len + // kv_len in blocks, used in the sort function + kv_len_in_block - 1}); } else { auto reorder_sub_work_count = kv_len_in_block; max_kv_len_in_reorder = std::max(max_kv_len_in_reorder, kv_len); for (int32_t block_id = 0; block_id < reorder_sub_work_count; block_id++) { reorder_items.emplace_back(ReorderWorkItem{ - i, // batch_in_seq - max_batch_in_reorder, // batch_in_reorder - block_id // kv_block_id + i, // batch_in_seq + max_batch_in_reorder, // batch_in_reorder + block_id // kv_block_id }); } @@ -1316,17 +1467,18 @@ struct MHA { auto attn_sub_work_count = static_cast(div_up(q_len, block_size)); for (int32_t block_id = 0; block_id < attn_sub_work_count; block_id++) { attn_items.emplace_back(AttnWorkItem{ - max_batch_in_reorder, // batch_in_reorder - i, // batch_in_seq - q_len, // q_len - block_id // q_block_id + max_batch_in_reorder, // batch_in_reorder + i, // batch_in_seq + q_len, // q_len + block_id // q_block_id }); } max_batch_in_reorder++; } total_kv_len += kv_len; } - // std::sort(attn_items.begin(), attn_items.end(), [] (const AttnWorkItem& left, const AttnWorkItem& right) { + // std::sort(attn_items.begin(), attn_items.end(), [] (const AttnWorkItem& left, const AttnWorkItem& right) + // { // // kv block number which will be acessed later // auto left_kv_blocks = left.q_block_id; // auto right_kv_blocks = right.q_block_id; @@ -1380,7 +1532,8 @@ struct MHA { auto reorder_work_count = _workitems.reorder_work_size(); // buffer for transpose and repack - _helper.init_reorder_buffers(_workitems.get_reorder_max_batch_size(), div_up(_workitems.get_reorder_max_kv_len(), _helper._block_size)); + _helper.init_reorder_buffers(_workitems.get_reorder_max_batch_size(), + div_up(_workitems.get_reorder_max_kv_len(), _helper._block_size)); // packed k, v 
parallel_for2d_dynamic(reorder_work_count, Hk, [&](size_t w, size_t hk) { @@ -1388,7 +1541,8 @@ struct MHA { const auto batch_in_seq = item.batch_in_seq; const auto batch_in_reorder = item.batch_in_reorder; const auto kv_block = item.kv_block_id; - auto block_number = block_indices.ptr()[block_indices_begins.ptr()[batch_in_seq] + kv_block]; + auto block_number = + block_indices.ptr()[block_indices_begins.ptr()[batch_in_seq] + kv_block]; if (block_number < 0) return; @@ -1396,10 +1550,12 @@ struct MHA { auto* k_ptr = k_cache.ptr(block_number, hk); auto* v_ptr = v_cache.ptr(block_number, hk); transpose_16NxK(_helper._qk_scratch_b.template ptr(batch_in_reorder, kv_block, hk), - k_ptr, - _helper._output.template ptr(ithr), - _helper._block_size, - _helper._S, _helper._block_size, _helper._S); + k_ptr, + _helper._output.template ptr(ithr), + _helper._block_size, + _helper._S, + _helper._block_size, + _helper._S); if (q_is_xf16) { pack_32NxK(_helper._wv_scratch_b.template ptr(batch_in_reorder, kv_block, hk), v_ptr, @@ -1411,12 +1567,34 @@ struct MHA { } else { // need to decompress if (!q_cache_is_same) { - dequant(_helper._wv_scratch_b.template ptr(batch_in_reorder, kv_block, hk), v_ptr, _helper._block_size, _helper._SV); + dequant(_helper._wv_scratch_b.template ptr(batch_in_reorder, kv_block, hk), + v_ptr, + _helper._block_size, + _helper._SV); } } }); - parallel_for2d_dynamic(attn_work_count, Hk, [&](size_t w, size_t hk) { + // loop along HK dimension: if mixed first/second token and elements count is enough, loop HK to reuse KV in the + // CPU cache + // else if elements count is small, prefer to loop H to get more work to avoid thread imbalance + bool loop_hk = _workitems.get_reorder_max_batch_size() == past_lens.m_dims[0] || // if only first token, loop H + attn_work_count * Hk <= 2 * _helper._nthr + ? false + : true; // or less than 2 work items per thread, loop H + + parallel_for2d_dynamic(attn_work_count, loop_hk ? 
Hk : _helper._H, [&](size_t w, size_t hx) { + size_t hk, hq_beg, hq_end; + if (loop_hk) { + hk = hx; + hq_beg = hk * _helper._h_each_group_len; + hq_end = (hk + 1) * _helper._h_each_group_len; + } else { + hq_beg = hx; + hq_end = hx + 1; + hk = hx / _helper._h_each_group_len; + } + const auto& item = _workitems.get_attn_work_item(w); const auto batch_in_seq = item.batch_in_seq; const auto batch_in_token = subsequence_begins.ptr()[batch_in_seq]; @@ -1431,16 +1609,26 @@ struct MHA { score_output = _helper._score_output.template ptr() + score_offset * _helper._H; } - _helper.exec_kernel_one_bh(q.slice(0, batch_in_token, batch_in_token), k_cache, v_cache, + _helper.exec_kernel_one_bh( + q.slice(0, batch_in_token, batch_in_token), + k_cache, + v_cache, output_emb.slice(0, batch_in_token, batch_in_token), block_indices.ptr() + block_indices_begins.ptr()[batch_in_seq], - ithr, hk, 1ul, cur_kv_len, alibi_slopes, + ithr, + hq_beg, + hq_end, + hk, + 1ul, + cur_kv_len, + alibi_slopes, score_output); } else { const auto batch_in_reorder = item.batch_in_reorder; const auto q_blk = item.q_block_id; const auto q_cnt = std::min(_helper._block_size, q_len - q_blk * _helper._block_size); - const auto cur_kv_len = static_cast(past_lens.ptr()[batch_in_seq]) + q_blk * _helper._block_size + q_cnt; + const auto cur_kv_len = + static_cast(past_lens.ptr()[batch_in_seq]) + q_blk * _helper._block_size + q_cnt; float* score_output = nullptr; if (output_score) { // last block @@ -1453,14 +1641,18 @@ struct MHA { PlainTensor sub_query; sub_query.resize({q_len, _helper._H, _helper._S}, q.ptr(batch_in_token)); sub_query = sub_query.permute({1, 0, 2}); - _helper.exec_kernel_multiple(sub_query, + _helper.exec_kernel_multiple( + sub_query, v_cache, - output_emb.slice(0, batch_in_token, batch_in_token + q_len).reshape({q_len, _helper._H * _helper._SV}), + output_emb.slice(0, batch_in_token, batch_in_token + q_len) + .reshape({q_len, _helper._H * _helper._SV}), _helper._qk_scratch_b.slice(0, 
batch_in_reorder, batch_in_reorder), _helper._wv_scratch_b.slice(0, batch_in_reorder, batch_in_reorder), block_indices.ptr() + block_indices_begins.ptr()[batch_in_seq], ithr, q_blk, + hq_beg, + hq_end, hk, q_len, cur_kv_len, @@ -1470,7 +1662,8 @@ struct MHA { }); if (output_score) { parallel_for2d_dynamic(past_lens.m_dims[0], 1, [&](size_t b, size_t pq) { - auto seq_len = static_cast(subsequence_begins.ptr()[b + 1] - subsequence_begins.ptr()[b]); + auto seq_len = static_cast(subsequence_begins.ptr()[b + 1] - + subsequence_begins.ptr()[b]); auto cur_kv_len = static_cast(past_lens.ptr()[b]) + seq_len; auto src_offset = _helper._score_offsets_aligned.template ptr()[b]; auto* src = _helper._score_output.template ptr() + src_offset * _helper._H; @@ -1501,11 +1694,29 @@ struct MHA { auto nthr = static_cast(parallel_get_max_threads()); if (past_lens.m_dims[0] >= nthr || _workitems.get_reorder_max_batch_size() > 0) { - exec_loop_mixed(query, present_key, present_value, output_emb, output_score, max_context_len, past_lens, subsequence_begins, - block_indices, block_indices_begins, alibi_slopes); + exec_loop_mixed(query, + present_key, + present_value, + output_emb, + output_score, + max_context_len, + past_lens, + subsequence_begins, + block_indices, + block_indices_begins, + alibi_slopes); } else { - _helper.exec_loop_bhl(query, present_key, present_value, output_emb, output_score, max_context_len, past_lens, subsequence_begins, - block_indices, block_indices_begins, alibi_slopes); + _helper.exec_loop_bhl(query, + present_key, + present_value, + output_emb, + output_score, + max_context_len, + past_lens, + subsequence_begins, + block_indices, + block_indices_begins, + alibi_slopes); } } }; @@ -1518,18 +1729,32 @@ struct AttentionExecutor : public PagedAttentionExecutor { AttentionExecutor() : _kernel(_helper) {} - void init(const std::vector& inputs, const std::vector& outputs, PlainTensor& q, PlainTensor& k, PlainTensor& v, PlainTensor& k_cache, - PlainTensor& v_cache, 
PlainTensor& past_lens, PlainTensor& subsequence_begins, PlainTensor& block_indices, PlainTensor& block_indices_begins, - float& scale, size_t& sliding_window, PlainTensor& alibi_slopes, size_t& max_context_len, PlainTensor& output_emb, PlainTensor& output_score) { - q.reset(inputs[ID_Q]); // [B_token, H * S] + void init(const std::vector& inputs, + const std::vector& outputs, + PlainTensor& q, + PlainTensor& k, + PlainTensor& v, + PlainTensor& k_cache, + PlainTensor& v_cache, + PlainTensor& past_lens, + PlainTensor& subsequence_begins, + PlainTensor& block_indices, + PlainTensor& block_indices_begins, + float& scale, + size_t& sliding_window, + PlainTensor& alibi_slopes, + size_t& max_context_len, + PlainTensor& output_emb, + PlainTensor& output_score) { + q.reset(inputs[ID_Q]); // [B_token, H * S] k.reset(inputs[ID_K]); v.reset(inputs[ID_V]); - k_cache.reset(inputs[ID_KCACHE]); // [NUM_BLOCKS, H, 32, S] - v_cache.reset(inputs[ID_VCACHE]); // [NUM_BLOCKS, H, 32, S] - past_lens.reset(inputs[ID_PAST_LENS]); // [B_seq] - subsequence_begins.reset(inputs[ID_SUBSEQUENCE_BEGINS]); // [B_seq+1] - block_indices.reset(inputs[ID_BLOCK_INDICES]); // [num_blocks] - block_indices_begins.reset(inputs[ID_BLOCK_INDICES_BEGINS]);// [B_seq+1] + k_cache.reset(inputs[ID_KCACHE]); // [NUM_BLOCKS, H, 32, S] + v_cache.reset(inputs[ID_VCACHE]); // [NUM_BLOCKS, H, 32, S] + past_lens.reset(inputs[ID_PAST_LENS]); // [B_seq] + subsequence_begins.reset(inputs[ID_SUBSEQUENCE_BEGINS]); // [B_seq+1] + block_indices.reset(inputs[ID_BLOCK_INDICES]); // [num_blocks] + block_indices_begins.reset(inputs[ID_BLOCK_INDICES_BEGINS]); // [B_seq+1] scale = *inputs[ID_SCALE]->getDataAs(); sliding_window = static_cast(*inputs[ID_SLIDING_WINDOW]->getDataAs()); if (!inputs[ID_ALIBI_SLOPES]->getShape().hasZeroDims()) @@ -1542,8 +1767,8 @@ struct AttentionExecutor : public PagedAttentionExecutor { auto B_token = q.size(0); auto Hk = k_cache.size(1); // The layout for per token per head for u8 kv cache: - // 
|scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized feature(u8,idx_S)| - // The actual size needs to deduct scale and zeropoint. + // |scale(f32)|zeropoint(f32)|quantized feature(u8,idx_1)|quantized feature(u8,idx_2)|...|quantized + // feature(u8,idx_S)| The actual size needs to deduct scale and zeropoint. auto S = k_cache.size(3) - (k_cache.m_dt == ov::element::Type_t::u8 ? sizeof(float) * 2 : 0); auto SV = v_cache.size(3) - (k_cache.m_dt == ov::element::Type_t::u8 ? sizeof(float) * 2 : 0); auto block_size = k_cache.size(2); @@ -1585,8 +1810,14 @@ struct AttentionExecutor : public PagedAttentionExecutor { _helper.init(H, S, SV, Hk, h_each_group_len, block_size, sliding_window, scale, max_context_len, alibi_slopes); } - void concat_pastkv(const PlainTensor& k, const PlainTensor& v, const PlainTensor& k_cache, const PlainTensor& v_cache, - const PlainTensor& past_lens, const PlainTensor& subsequence_begins, const PlainTensor& block_indices, const PlainTensor& block_indices_begins) { + void concat_pastkv(const PlainTensor& k, + const PlainTensor& v, + const PlainTensor& k_cache, + const PlainTensor& v_cache, + const PlainTensor& past_lens, + const PlainTensor& subsequence_begins, + const PlainTensor& block_indices, + const PlainTensor& block_indices_begins) { auto B_token = k.size(0); _slot_mapping.resize({B_token}); @@ -1598,8 +1829,10 @@ struct AttentionExecutor : public PagedAttentionExecutor { auto block_offset_start = kv_len - q_len; for (int32_t j = 0; j < q_len; j++) { auto block_offset = block_offset_start + j; - auto block_number = block_indices.ptr()[block_number_start + block_offset / _helper._block_size]; - _slot_mapping.ptr()[idx++] = block_number * _helper._block_size + block_offset % _helper._block_size; + auto block_number = + block_indices.ptr()[block_number_start + block_offset / _helper._block_size]; + _slot_mapping.ptr()[idx++] = + block_number * _helper._block_size + block_offset % _helper._block_size; 
} } @@ -1620,12 +1853,36 @@ struct AttentionExecutor : public PagedAttentionExecutor { PlainTensor output_emb; PlainTensor output_score; - init(inputs, outputs, q, k, v, k_cache, v_cache, past_lens, subsequence_begins, block_indices, block_indices_begins, - scale, sliding_window, alibi_slopes, max_context_len, output_emb, output_score); + init(inputs, + outputs, + q, + k, + v, + k_cache, + v_cache, + past_lens, + subsequence_begins, + block_indices, + block_indices_begins, + scale, + sliding_window, + alibi_slopes, + max_context_len, + output_emb, + output_score); concat_pastkv(k, v, k_cache, v_cache, past_lens, subsequence_begins, block_indices, block_indices_begins); - _kernel(q, k_cache, v_cache, output_emb, output_score, max_context_len, past_lens, subsequence_begins, block_indices, - block_indices_begins, alibi_slopes); + _kernel(q, + k_cache, + v_cache, + output_emb, + output_score, + max_context_len, + past_lens, + subsequence_begins, + block_indices, + block_indices_begins, + alibi_slopes); } }; #endif @@ -1635,27 +1892,27 @@ std::shared_ptr make_pa_executor(ov::element::Type data_ #ifdef OPENVINO_ARCH_X86_64 if (data_type == ov::element::bf16) { -#if defined(HAVE_AVX512F) +# if defined(HAVE_AVX512F) if (kvcache_type == ov::element::u8) { executor = std::make_shared>(); } else { OPENVINO_ASSERT(kvcache_type == ov::element::bf16, "expect kvcache type bf16, current: ", kvcache_type); executor = std::make_shared>(); } -#else +# else OPENVINO_THROW("make_pa_executor: bf16 needs avx512+ hardware."); -#endif +# endif } else if (data_type == ov::element::f16) { -#if defined(HAVE_AVX512F) +# if defined(HAVE_AVX512F) if (kvcache_type == ov::element::u8) { executor = std::make_shared>(); } else { OPENVINO_ASSERT(kvcache_type == ov::element::f16, "expect kvcache type f16, current: ", kvcache_type); executor = std::make_shared>(); } -#else - OPENVINO_THROW("make_pa_executor: f16 needs avx512+ hardware."); -#endif +# else + OPENVINO_THROW("make_pa_executor: f16 needs 
avx512+ hardware."); +# endif } else if (data_type == ov::element::f32) { if (kvcache_type == ov::element::u8) { executor = std::make_shared>(); diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.hpp index ed779dee13c96d..d28125b3898460 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.hpp @@ -6,8 +6,9 @@ #include #include #include -#include #include +#include + #include "cpu_memory.h" #include "executor_pa_common.hpp" diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa_common.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa_common.cpp index 70723a577b0c2b..8a7fa211f8f4ce 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa_common.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa_common.cpp @@ -1,6 +1,8 @@ // Copyright (C) 2018-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // +#include "executor_pa_common.hpp" + #include #include @@ -9,10 +11,9 @@ #include #include +#include "openvino/core/parallel.hpp" #include "openvino/core/type/bfloat16.hpp" #include "openvino/core/type/float16.hpp" -#include "openvino/core/parallel.hpp" -#include "executor_pa_common.hpp" #include "utils/plain_tensor.hpp" namespace ov { @@ -58,20 +59,23 @@ void TileConfiger::generate() { ret(); } -JitMatMulVecAMX::JitMatMulVecAMX(int head_size, int block_size, ov::element::Type amx_prec) : - jit_generator(jit_name()), m_head_size(head_size), m_block_size(block_size), m_amx_prec(amx_prec) { +JitMatMulVecAMX::JitMatMulVecAMX(int head_size, int block_size, ov::element::Type amx_prec) + : jit_generator(jit_name()), + m_head_size(head_size), + m_block_size(block_size), + m_amx_prec(amx_prec) { create_kernel(); m_tile_cfg.reset(1, 0, { - {16, 4}, // C:0 M x 1 (4b) - {16, 64}, // A:1 M x 32/64 (64b) 
- {16, 4}, // B:2 32/64 x 1 (4b) - {16, 4}, // B:3 - {16, 4}, // B:4 - {16, 4}, // B:5 - {16, 4}, // B:6 - {16, 4}, // B:7 + {16, 4}, // C:0 M x 1 (4b) + {16, 64}, // A:1 M x 32/64 (64b) + {16, 4}, // B:2 32/64 x 1 (4b) + {16, 4}, // B:3 + {16, 4}, // B:4 + {16, 4}, // B:5 + {16, 4}, // B:6 + {16, 4}, // B:7 }); } diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa_common.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa_common.hpp index bc21457a3285b4..81c54c84d9453a 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa_common.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa_common.hpp @@ -6,11 +6,12 @@ #include #include #include -#include #include -#include "cpu_memory.h" +#include + #include "cpu/x64/cpu_isa_traits.hpp" #include "cpu/x64/jit_generator.hpp" +#include "cpu_memory.h" namespace ov { namespace Extensions { @@ -20,20 +21,21 @@ namespace Cpu { struct PagedAttentionExecutor { // PagedAttention input index - static const size_t ID_Q = 0; // [B_token, H * S], float - static const size_t ID_K = 1; // [B_token, Hk * S], float - static const size_t ID_V = 2; // [B_token, Hk * S], float - static const size_t ID_KCACHE = 3; // [block_number, H, block_size, S], float - static const size_t ID_VCACHE = 4; // [block_number, H, block_size, S], float - static const size_t ID_PAST_LENS = 5; // [B_seq] - static const size_t ID_SUBSEQUENCE_BEGINS = 6; // [B_seq+1] - static const size_t ID_BLOCK_INDICES = 7; // [num_blocks] - static const size_t ID_BLOCK_INDICES_BEGINS = 8; // [B_seq+1] - static const size_t ID_SCALE = 9; // [], float - static const size_t ID_SLIDING_WINDOW = 10; // [] - static const size_t ID_ALIBI_SLOPES = 11; // [H|0], float - static const size_t ID_MAX_CONTEXT_LEN = 12; // [] - virtual void execute(const std::vector& inputs, const std::vector outputs) = 0; + static const size_t ID_Q = 0; // [B_token, H * S], float + static const size_t ID_K = 1; // [B_token, 
Hk * S], float + static const size_t ID_V = 2; // [B_token, Hk * S], float + static const size_t ID_KCACHE = 3; // [block_number, H, block_size, S], float + static const size_t ID_VCACHE = 4; // [block_number, H, block_size, S], float + static const size_t ID_PAST_LENS = 5; // [B_seq] + static const size_t ID_SUBSEQUENCE_BEGINS = 6; // [B_seq+1] + static const size_t ID_BLOCK_INDICES = 7; // [num_blocks] + static const size_t ID_BLOCK_INDICES_BEGINS = 8; // [B_seq+1] + static const size_t ID_SCALE = 9; // [], float + static const size_t ID_SLIDING_WINDOW = 10; // [] + static const size_t ID_ALIBI_SLOPES = 11; // [H|0], float + static const size_t ID_MAX_CONTEXT_LEN = 12; // [] + virtual void execute(const std::vector& inputs, + const std::vector outputs) = 0; virtual ~PagedAttentionExecutor() = default; }; diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp index 25ddbb1b4246b1..f2180b5314cc07 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.cpp @@ -13,11 +13,10 @@ # include #endif - -#include "openvino/core/type/bfloat16.hpp" -#include "openvino/core/parallel.hpp" -#include "mha_single_token.hpp" #include "common.hpp" +#include "mha_single_token.hpp" +#include "openvino/core/parallel.hpp" +#include "openvino/core/type/bfloat16.hpp" #include "softmax_kernel.hpp" #if defined(OPENVINO_ARCH_ARM64) @@ -33,19 +32,20 @@ using namespace ov; #if defined(HAVE_AVX2) -#define prefetch_bytes(bytes, sel, advance, src) { \ - auto *p = reinterpret_cast(src); \ - for (size_t i = 0; i < bytes; i += 64) \ - _mm_prefetch(p + i + advance, sel); \ -} +# define prefetch_bytes(bytes, sel, advance, src) \ + { \ + auto* p = reinterpret_cast(src); \ + for (size_t i = 0; i < bytes; i += 64) \ + _mm_prefetch(p + i + advance, sel); \ + } #else -#define prefetch_bytes(bytes, 
sel, advance, src) +# define prefetch_bytes(bytes, sel, advance, src) #endif -template +template void cvt_copy(TA* dst, TB* src, size_t n) { size_t i = 0; #if defined(HAVE_AVX512F) @@ -65,21 +65,21 @@ void cvt_copy(TA* dst, TB* src, size_t n) { __vst1q_f32(dst + i, vb1); } } -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +# if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) if (std::is_same::value && std::is_same::value) { for (; i + vec_len_f16_neon <= n; i += vec_len_f16_neon) { auto vb1 = vld1q_f16(reinterpret_cast(src + i)); vst1q_f16(reinterpret_cast(dst + i), vb1); } } -#endif +# endif #endif for (; i < n; i++) { dst[i] = src[i]; } } -template +template static void attn_acc_value(float* out, float weight, T* v, size_t S, float* scale, float* zp) { size_t i = 0; #if defined(HAVE_AVX512F) @@ -113,12 +113,12 @@ static void attn_acc_value(float* out, float weight, T* v, size_t S, float* scal } #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) -template +template static void attn_acc_value(ov::float16* out, ov::float16 weight, T* v, size_t S, float* scale, float* zp) { size_t i = 0; auto attn_w_vec_fp16 = vdupq_n_f16(weight); - auto _v = reinterpret_cast(v); - auto _out = reinterpret_cast(out); + auto _v = reinterpret_cast(v); + auto _out = reinterpret_cast(out); for (; i + vec_len_f16_neon <= S; i += vec_len_f16_neon) { auto v_value = vld1q_f16(_v + i); auto v_out = vld1q_f16(_out + i); @@ -131,7 +131,6 @@ static void attn_acc_value(ov::float16* out, ov::float16 weight, T* v, size_t S, } #endif - static void attn_acc_value(float* out, float weight, uint8_t* v, size_t S, float* scale, float* zp) { size_t i = 0; weight *= *scale; @@ -285,7 +284,7 @@ static void attn_acc_value(float* out, float weight, uint8_t* v, size_t S, float } } -template +template static float sum_q_head(T* a, size_t n) { float sum = 0.0f; size_t i = 0; @@ -406,7 +405,7 @@ static float sum_q_head(T* a, size_t n) { return sum; } -template +template static float dot_product(TA* a, TB* b, 
size_t n, float* scale, float* zp, float* head_sum) { size_t i = 0; float sum = 0.0f; @@ -552,7 +551,12 @@ static float dot_product(TA* a, TB* b, size_t n, float* scale, float* zp, float* } #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) -static ov::float16 dot_product_fp16(ov::float16* a, ov::float16* b, size_t n, float* scale, float* zp, float* head_sum) { +static ov::float16 dot_product_fp16(ov::float16* a, + ov::float16* b, + size_t n, + float* scale, + float* zp, + float* head_sum) { size_t i = 0; ov::float16 sum = 0.0f; auto vsum0 = vdupq_n_f16(0.0f); @@ -609,7 +613,7 @@ static ov::float16 dot_product_fp16(ov::float16* a, ov::float16* b, size_t n, fl } #endif -template +template static float dot_product(TA* a, uint8_t* b, size_t n, float* scale, float* zp, float* head_sum) { size_t i = 0; float sum = 0.0f; @@ -763,11 +767,11 @@ static float dot_product(TA* a, uint8_t* b, size_t n, float* scale, float* zp, f #endif } -template +template static void attn_reduce(T* dst, float* temp, size_t M, size_t S, size_t temp_stride) { size_t i = 0; #if defined(HAVE_AVX512F) - for (; i + vec_len_f32_avx512 <= S; i+= vec_len_f32_avx512) { + for (; i + vec_len_f32_avx512 <= S; i += vec_len_f32_avx512) { auto* src = temp + i; auto result_vec_fp32 = _mm512_setzero_ps(); for (size_t m = 0; m < M; m++) { @@ -903,11 +907,16 @@ static void mha_single_token_kernel(const ov::intel_cpu::PlainTensor& query, for (size_t iwork = start; iwork < end; ++iwork) { auto p = past_k_scale_zp.ptr(pk, 0, h_group); #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - if (std::is_same::value && std::is_same::value && std::is_same::value) { + if (std::is_same::value && std::is_same::value && + std::is_same::value) { auto p_k = present_key.ptr(0, h_group, pk); prefetch_bytes(S, _MM_HINT_T0, 4096, p_k); - auto _qk = dot_product_fp16(query.ptr(0, h_group), p_k, - S, p, p + 1, head_sum.ptr(0, h_group)); + auto _qk = dot_product_fp16(query.ptr(0, h_group), + p_k, + S, + p, + p + 1, + head_sum.ptr(0, 
h_group)); buf_attn_w.ptr(0, h_group, 0)[pk] = _qk; parallel_it_step(pk, kv_len, b, B, h_group, h_group_num); continue; @@ -915,8 +924,9 @@ static void mha_single_token_kernel(const ov::intel_cpu::PlainTensor& query, #endif auto p_k = present_key.ptr(0, h_group, pk); prefetch_bytes(S, _MM_HINT_T0, 4096, p_k); - buf_attn_w.ptr(0, h_group, 0)[pk] = dot_product(query.ptr(0, h_group), p_k, - S, p, p + 1, head_sum.ptr(0, h_group));; + buf_attn_w.ptr(0, h_group, 0)[pk] = + dot_product(query.ptr(0, h_group), p_k, S, p, p + 1, head_sum.ptr(0, h_group)); + ; parallel_it_step(pk, kv_len, b, B, h_group, h_group_num); } } else { @@ -924,10 +934,15 @@ static void mha_single_token_kernel(const ov::intel_cpu::PlainTensor& query, auto b_kv = beams ? beams.ptr(b)[pk] : b; auto p = past_k_scale_zp.ptr(pk, b_kv, h_group); #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - if (std::is_same::value && std::is_same::value && std::is_same::value) { + if (std::is_same::value && std::is_same::value && + std::is_same::value) { auto p_k = present_key.ptr(b_kv, h_group, pk); - auto _qk = dot_product_fp16(query.ptr(b, h_group), p_k, - S, p, p + 1, head_sum.ptr(b, h_group)); + auto _qk = dot_product_fp16(query.ptr(b, h_group), + p_k, + S, + p, + p + 1, + head_sum.ptr(b, h_group)); buf_attn_w.ptr(b, h_group, 0)[pk] = _qk; parallel_it_step(pk, kv_len, b, B, h_group, h_group_num); continue; @@ -935,8 +950,7 @@ static void mha_single_token_kernel(const ov::intel_cpu::PlainTensor& query, #endif auto p_k = present_key.ptr(b_kv, h_group, pk); buf_attn_w.ptr(b, h_group, 0)[pk] = - dot_product(query.ptr(b, h_group), p_k, - S, p, p + 1, head_sum.ptr(b, h_group)); + dot_product(query.ptr(b, h_group), p_k, S, p, p + 1, head_sum.ptr(b, h_group)); parallel_it_step(pk, kv_len, b, B, h_group, h_group_num); } } @@ -947,17 +961,25 @@ static void mha_single_token_kernel(const ov::intel_cpu::PlainTensor& query, auto p = past_k_scale_zp.ptr(pk, b_kv, h_group); for (size_t h = h_group * h_each_group_len; h < 
(h_group + 1) * h_each_group_len; h++) { #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - if (std::is_same::value && std::is_same::value && std::is_same::value) { + if (std::is_same::value && std::is_same::value && + std::is_same::value) { auto p_k = present_key.ptr(b_kv, h_group, pk); - auto _qk = dot_product_fp16(query.ptr(b, h, pq), p_k, - S, p, p + 1, head_sum.ptr(b, h, pq)); + auto _qk = dot_product_fp16(query.ptr(b, h, pq), + p_k, + S, + p, + p + 1, + head_sum.ptr(b, h, pq)); buf_attn_w.ptr(b, h, pq)[pk] = _qk; continue; } #endif - buf_attn_w.ptr(b, h, pq)[pk] = - dot_product(query.ptr(b, h, pq), present_key.ptr(b_kv, h_group, pk), - S, p, p + 1, head_sum.ptr(b, h, pq)); + buf_attn_w.ptr(b, h, pq)[pk] = dot_product(query.ptr(b, h, pq), + present_key.ptr(b_kv, h_group, pk), + S, + p, + p + 1, + head_sum.ptr(b, h, pq)); } } parallel_it_step(pk, kv_len, b, B, h_group, h_group_num); @@ -1001,7 +1023,8 @@ static void mha_single_token_kernel(const ov::intel_cpu::PlainTensor& query, auto* v = present_value.ptr(b_kv, h_group, pv); auto p = past_v_scale_zp.ptr(pv, b_kv, h_group); for (size_t pq = 0; pq < q_len; pq++) { - for (size_t h = h_group * h_each_group_len, group_idx = 0; h < (h_group + 1) * h_each_group_len; h++, group_idx++) { + for (size_t h = h_group * h_each_group_len, group_idx = 0; h < (h_group + 1) * h_each_group_len; + h++, group_idx++) { attn_acc_value(buf_attn_score.ptr(ithr, pq, group_idx), buf_attn_w.ptr(b, h, pq)[pv], v, @@ -1014,7 +1037,7 @@ static void mha_single_token_kernel(const ov::intel_cpu::PlainTensor& query, // convert to dst for (size_t pq = 0; pq < q_len; pq++) { for (size_t h = h_group * h_each_group_len, group_idx = 0; h < (h_group + 1) * h_each_group_len; - h++, group_idx++) { + h++, group_idx++) { auto* dst = has_out_transpose ? 
output_emb.ptr(b, pq, h * SV) : output_emb.ptr(b, h, pq); cvt_copy(dst, buf_attn_score.ptr(ithr, pq, group_idx), SV); } diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.hpp index e29e2bae0aa07a..2ef0f62d7e0df0 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/mha_single_token.hpp @@ -6,8 +6,9 @@ #include #include #include -#include #include +#include + #include "utils/plain_tensor.hpp" namespace ov { diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax.cpp index 28755e69eaf589..c02f9770a37be9 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax.cpp @@ -13,10 +13,10 @@ # include #endif +#include "common.hpp" #include "openvino/core/type/bfloat16.hpp" #include "softmax.hpp" #include "softmax_kernel.hpp" -#include "common.hpp" namespace ov { namespace Extensions { @@ -39,13 +39,33 @@ void attn_softmax(void* a, if (precision == ov::element::f16) { auto _a = reinterpret_cast(a); auto _alibi = reinterpret_cast(alibi); - attn_softmax_kernel(_a, a_dst, scale, _alibi, attn_mask, causal_mask, select_nfltmax_at_0, len, total_size, attn_mask_prec, dst_precision); + attn_softmax_kernel(_a, + a_dst, + scale, + _alibi, + attn_mask, + causal_mask, + select_nfltmax_at_0, + len, + total_size, + attn_mask_prec, + dst_precision); return; } #endif auto _a = reinterpret_cast(a); auto _alibi = reinterpret_cast(alibi); - attn_softmax_kernel(_a, a_dst, scale, _alibi, attn_mask, causal_mask, select_nfltmax_at_0, len, total_size, attn_mask_prec, dst_precision); + attn_softmax_kernel(_a, + a_dst, + scale, + _alibi, + attn_mask, + causal_mask, + select_nfltmax_at_0, + len, + total_size, + attn_mask_prec, + dst_precision); 
} } // namespace XARCH diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax.hpp index ee264924e8f256..d620a01e221788 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax.hpp @@ -6,8 +6,8 @@ #include #include #include -#include #include +#include namespace ov { namespace Extensions { diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax_kernel.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax_kernel.hpp index 60c6a24ec5f2fa..48b92b53fa2727 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax_kernel.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/softmax_kernel.hpp @@ -3,16 +3,16 @@ // #pragma once -#include "common.hpp" -#include "openvino/core/type/element_type.hpp" - #include #include #include #include +#include "common.hpp" +#include "openvino/core/type/element_type.hpp" + #if defined(OPENVINO_ARCH_ARM64) -#include "arm_neon.h" +# include "arm_neon.h" #endif namespace ov { @@ -22,7 +22,7 @@ namespace XARCH { #if defined(HAVE_AVX2) inline void exp_ps_avx2(__m256& src) { -#define REPEAT8(x) x, x, x, x, x, x, x, x +# define REPEAT8(x) x, x, x, x, x, x, x, x static const uint32_t c_min[] = {REPEAT8(0xc2aeac50)}; static const uint32_t c_max[] = {REPEAT8(0x42b17218)}; static const uint32_t c_e[] = {REPEAT8(0x3fb8aa3b)}; @@ -36,21 +36,21 @@ inline void exp_ps_avx2(__m256& src) { static const uint32_t c_p4[] = {REPEAT8(0x3d2b9d0d)}; static const uint32_t c_p5[] = {REPEAT8(0x3c07cfce)}; static const uint32_t c_2[] = {REPEAT8(0x40000000)}; -#undef REPEAT8 +# undef REPEAT8 static constexpr int n_mantissa_bits = 23; - __m256 exp_ln_flt_min_f = _mm256_loadu_ps(reinterpret_cast(c_min)); // log(FLT_MIN) - __m256 exp_ln_flt_max_f = _mm256_loadu_ps(reinterpret_cast(c_max)); // log(FLT_MAX) - __m256 exp_log2ef = 
_mm256_loadu_ps(reinterpret_cast(c_e)); // log2(e) - __m256 half = _mm256_loadu_ps(reinterpret_cast(c_half)); // 0.5f - __m256 ln2f = _mm256_loadu_ps(reinterpret_cast(c_ln2)); // ln(2) - __m256 one = _mm256_loadu_ps(reinterpret_cast(c_1)); // 1.0f - __m256i exponent_bias = _mm256_loadu_si256(reinterpret_cast(c_bias));// 127 - __m256 exp_pol1 = _mm256_loadu_ps(reinterpret_cast(c_p1)); // p1 = 0.999999701f - __m256 exp_pol2 = _mm256_loadu_ps(reinterpret_cast(c_p2)); // p2 = 0.499991506f - __m256 exp_pol3 = _mm256_loadu_ps(reinterpret_cast(c_p3)); // p3 = 0.166676521f - __m256 exp_pol4 = _mm256_loadu_ps(reinterpret_cast(c_p4)); // p4 = 0.0418978221f - __m256 exp_pol5 = _mm256_loadu_ps(reinterpret_cast(c_p5)); // p5 = 0.00828929059f - __m256 two = _mm256_loadu_ps(reinterpret_cast(c_2)); // 2 + __m256 exp_ln_flt_min_f = _mm256_loadu_ps(reinterpret_cast(c_min)); // log(FLT_MIN) + __m256 exp_ln_flt_max_f = _mm256_loadu_ps(reinterpret_cast(c_max)); // log(FLT_MAX) + __m256 exp_log2ef = _mm256_loadu_ps(reinterpret_cast(c_e)); // log2(e) + __m256 half = _mm256_loadu_ps(reinterpret_cast(c_half)); // 0.5f + __m256 ln2f = _mm256_loadu_ps(reinterpret_cast(c_ln2)); // ln(2) + __m256 one = _mm256_loadu_ps(reinterpret_cast(c_1)); // 1.0f + __m256i exponent_bias = _mm256_loadu_si256(reinterpret_cast(c_bias)); // 127 + __m256 exp_pol1 = _mm256_loadu_ps(reinterpret_cast(c_p1)); // p1 = 0.999999701f + __m256 exp_pol2 = _mm256_loadu_ps(reinterpret_cast(c_p2)); // p2 = 0.499991506f + __m256 exp_pol3 = _mm256_loadu_ps(reinterpret_cast(c_p3)); // p3 = 0.166676521f + __m256 exp_pol4 = _mm256_loadu_ps(reinterpret_cast(c_p4)); // p4 = 0.0418978221f + __m256 exp_pol5 = _mm256_loadu_ps(reinterpret_cast(c_p5)); // p5 = 0.00828929059f + __m256 two = _mm256_loadu_ps(reinterpret_cast(c_2)); // 2 // exp(x) = // = exp(n * ln(2) + r) // divide x by ln(2) and get quot and rem // = 2^n * exp(r) // simplify the exp(n*ln(2)) expression @@ -195,32 +195,33 @@ inline void scale_add2_reduce_max(float* a, // 
process vector body // unroll to avoid dependency caused by _mm256_max_ps for (; i + 4 * vec_len_f32_avx512 <= size; i += 4 * vec_len_f32_avx512) { - #define ITEM(n) \ - v_a = _mm512_loadu_ps(a + i + n * vec_len_f32_avx512); \ - v_a = _mm512_mul_ps(v_a, v_scale); \ - if (has_alibi) { \ - auto v_lookup = _mm512_loadu_ps(alibi_lookup + i + n * vec_len_f32_avx512); \ - v_a = _mm512_fmadd_ps(v_lookup, v_alibi_slope, v_a); \ - } \ - if (has_attn_mask) { \ - auto v_mask = mm512_uni_loadu_ps(attn_mask + i + n * vec_len_f32_avx512); \ - v_a = _mm512_add_ps(v_a, v_mask); \ - } \ - if (has_causal_mask) { \ - auto v_maski8 = _mm_loadu_si128(reinterpret_cast<__m128i const*>(causal_mask + i + n * vec_len_f32_avx512)); \ - auto v_maski32 = _mm512_cvtepi8_epi32(v_maski8); \ - auto kmask = _mm512_cmp_epi32_mask(v_maski32, v_zeroi32, _MM_CMPINT_NE); \ - kmask = _kxor_mask16(kmask, kmask_xor); \ - v_a = _mm512_mask_blend_ps(kmask, v_a, v_nfltmax); \ - } \ - v_max##n = _mm512_max_ps(v_max##n, v_a); \ +# define ITEM(n) \ + v_a = _mm512_loadu_ps(a + i + n * vec_len_f32_avx512); \ + v_a = _mm512_mul_ps(v_a, v_scale); \ + if (has_alibi) { \ + auto v_lookup = _mm512_loadu_ps(alibi_lookup + i + n * vec_len_f32_avx512); \ + v_a = _mm512_fmadd_ps(v_lookup, v_alibi_slope, v_a); \ + } \ + if (has_attn_mask) { \ + auto v_mask = mm512_uni_loadu_ps(attn_mask + i + n * vec_len_f32_avx512); \ + v_a = _mm512_add_ps(v_a, v_mask); \ + } \ + if (has_causal_mask) { \ + auto v_maski8 = \ + _mm_loadu_si128(reinterpret_cast<__m128i const*>(causal_mask + i + n * vec_len_f32_avx512)); \ + auto v_maski32 = _mm512_cvtepi8_epi32(v_maski8); \ + auto kmask = _mm512_cmp_epi32_mask(v_maski32, v_zeroi32, _MM_CMPINT_NE); \ + kmask = _kxor_mask16(kmask, kmask_xor); \ + v_a = _mm512_mask_blend_ps(kmask, v_a, v_nfltmax); \ + } \ + v_max##n = _mm512_max_ps(v_max##n, v_a); \ _mm512_storeu_ps(a + i + n * vec_len_f32_avx512, v_a); ITEM(0); ITEM(1); ITEM(2); ITEM(3); - #undef ITEM +# undef ITEM } while (i + 
vec_len_f32_avx512 <= size) { v_a = _mm512_loadu_ps(a + i); @@ -295,32 +296,32 @@ inline void scale_add2_reduce_max(float* a, // process vector body // unroll to avoid dependency caused by _mm512_max_ps for (; i + 4 * vec_len_f32_avx2 <= size; i += 4 * vec_len_f32_avx2) { - #define ITEM(n) \ - v_a = _mm256_loadu_ps(a + i + n * vec_len_f32_avx2); \ - v_a = _mm256_mul_ps(v_a, v_scale); \ - if (has_alibi) { \ - auto v_lookup = _mm256_loadu_ps(alibi_lookup + i + n * vec_len_f32_avx2); \ - v_a = _mm256_fmadd_ps(v_lookup, v_alibi_slope, v_a); \ - } \ - if (has_attn_mask) { \ - auto v_mask = mm256_uni_loadu_ps(attn_mask + i + n * vec_len_f32_avx2); \ - v_a = _mm256_add_ps(v_a, v_mask); \ - } \ - if (has_causal_mask) { \ - auto v_maski8 = _mm_loadu_si128(reinterpret_cast<__m128i const*>(causal_mask + i + n * vec_len_f32_avx2)); \ - auto v_maski32 = _mm256_cvtepi8_epi32(v_maski8); \ - v_maski32 = _mm256_cmpeq_epi32(v_maski32, v_zeroi32);\ - v_maski32 = _mm256_xor_si256(v_maski32, v_mask_xor);\ - v_a = _mm256_blendv_ps(v_nfltmax, v_a, _mm256_castsi256_ps(v_maski32)); \ - } \ - v_max##n = _mm256_max_ps(v_max##n, v_a); \ +# define ITEM(n) \ + v_a = _mm256_loadu_ps(a + i + n * vec_len_f32_avx2); \ + v_a = _mm256_mul_ps(v_a, v_scale); \ + if (has_alibi) { \ + auto v_lookup = _mm256_loadu_ps(alibi_lookup + i + n * vec_len_f32_avx2); \ + v_a = _mm256_fmadd_ps(v_lookup, v_alibi_slope, v_a); \ + } \ + if (has_attn_mask) { \ + auto v_mask = mm256_uni_loadu_ps(attn_mask + i + n * vec_len_f32_avx2); \ + v_a = _mm256_add_ps(v_a, v_mask); \ + } \ + if (has_causal_mask) { \ + auto v_maski8 = _mm_loadu_si128(reinterpret_cast<__m128i const*>(causal_mask + i + n * vec_len_f32_avx2)); \ + auto v_maski32 = _mm256_cvtepi8_epi32(v_maski8); \ + v_maski32 = _mm256_cmpeq_epi32(v_maski32, v_zeroi32); \ + v_maski32 = _mm256_xor_si256(v_maski32, v_mask_xor); \ + v_a = _mm256_blendv_ps(v_nfltmax, v_a, _mm256_castsi256_ps(v_maski32)); \ + } \ + v_max##n = _mm256_max_ps(v_max##n, v_a); \ 
_mm256_storeu_ps(a + i + n * vec_len_f32_avx2, v_a); ITEM(0); ITEM(1); ITEM(2); ITEM(3); - #undef ITEM +# undef ITEM } while (i + vec_len_f32_avx2 <= size) { @@ -415,7 +416,7 @@ inline void scale_add2_reduce_max(float* a, uint32x4_t v_maski32[2] = {v_maski32_low, v_maski32_high}; for (int j = 0; j < 2; ++j) { uint32x4_t kmask = vceqq_u32(v_maski32[j], v_zeroi32); // ==0 - v_a = vbslq_f32(kmask, v_nfltmax, v_a); // mask => -FLT_MAX + v_a = vbslq_f32(kmask, v_nfltmax, v_a); // mask => -FLT_MAX } } @@ -521,7 +522,7 @@ inline void scale_add2_reduce_max(ov::float16* a, #if defined(HAVE_AVX512F) static inline void exp_ps_avx512(__m512& src) { -#define REPEAT16(x) x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x +# define REPEAT16(x) x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x static const uint32_t c_min[] = {REPEAT16(0xc2aeac50)}; static const uint32_t c_max[] = {REPEAT16(0x42b17218)}; static const uint32_t c_e[] = {REPEAT16(0x3fb8aa3b)}; @@ -535,21 +536,21 @@ static inline void exp_ps_avx512(__m512& src) { static const uint32_t c_p4[] = {REPEAT16(0x3d2b9d0d)}; static const uint32_t c_p5[] = {REPEAT16(0x3c07cfce)}; static const uint32_t c_2[] = {REPEAT16(0x40000000)}; -#undef REPEAT16 +# undef REPEAT16 static constexpr int n_mantissa_bits = 23; - __m512 exp_ln_flt_min_f = _mm512_loadu_ps(reinterpret_cast(c_min)); // log(FLT_MIN) - __m512 exp_ln_flt_max_f = _mm512_loadu_ps(reinterpret_cast(c_max)); // log(FLT_MAX) - __m512 exp_log2ef = _mm512_loadu_ps(reinterpret_cast(c_e)); // log2(e) - __m512 half = _mm512_loadu_ps(reinterpret_cast(c_half)); // 0.5f - __m512 ln2f = _mm512_loadu_ps(reinterpret_cast(c_ln2)); // ln(2) - __m512 one = _mm512_loadu_ps(reinterpret_cast(c_1)); // 1.0f - __m512i exponent_bias = _mm512_loadu_si512(c_bias); // 127 - __m512 exp_pol1 = _mm512_loadu_ps(reinterpret_cast(c_p1)); // p1 = 0.999999701f - __m512 exp_pol2 = _mm512_loadu_ps(reinterpret_cast(c_p2)); // p2 = 0.499991506f - __m512 exp_pol3 = _mm512_loadu_ps(reinterpret_cast(c_p3)); // p3 = 
0.166676521f - __m512 exp_pol4 = _mm512_loadu_ps(reinterpret_cast(c_p4)); // p4 = 0.0418978221f - __m512 exp_pol5 = _mm512_loadu_ps(reinterpret_cast(c_p5)); // p5 = 0.00828929059f - __m512 two = _mm512_loadu_ps(reinterpret_cast(c_2)); // 2 + __m512 exp_ln_flt_min_f = _mm512_loadu_ps(reinterpret_cast(c_min)); // log(FLT_MIN) + __m512 exp_ln_flt_max_f = _mm512_loadu_ps(reinterpret_cast(c_max)); // log(FLT_MAX) + __m512 exp_log2ef = _mm512_loadu_ps(reinterpret_cast(c_e)); // log2(e) + __m512 half = _mm512_loadu_ps(reinterpret_cast(c_half)); // 0.5f + __m512 ln2f = _mm512_loadu_ps(reinterpret_cast(c_ln2)); // ln(2) + __m512 one = _mm512_loadu_ps(reinterpret_cast(c_1)); // 1.0f + __m512i exponent_bias = _mm512_loadu_si512(c_bias); // 127 + __m512 exp_pol1 = _mm512_loadu_ps(reinterpret_cast(c_p1)); // p1 = 0.999999701f + __m512 exp_pol2 = _mm512_loadu_ps(reinterpret_cast(c_p2)); // p2 = 0.499991506f + __m512 exp_pol3 = _mm512_loadu_ps(reinterpret_cast(c_p3)); // p3 = 0.166676521f + __m512 exp_pol4 = _mm512_loadu_ps(reinterpret_cast(c_p4)); // p4 = 0.0418978221f + __m512 exp_pol5 = _mm512_loadu_ps(reinterpret_cast(c_p5)); // p5 = 0.00828929059f + __m512 two = _mm512_loadu_ps(reinterpret_cast(c_2)); // 2 // exp(x) = // = exp(n * ln(2) + r) // divide x by ln(2) and get quot and rem // = 2^n * exp(r) // simplify the exp(n*ln(2)) expression @@ -793,7 +794,9 @@ inline void multiply_scalar(float* a, float* a_dst, const float val, const size_ } } -template::value || std::is_same::value), bool>::type> +template ::value || std::is_same::value), bool>::type> inline void multiply_scalar(float* a, T* a_dst, const float val, const size_t size) { size_t i = 0; #if defined(HAVE_AVX512F) @@ -899,47 +902,68 @@ inline void attn_softmax_kernel(float* a, ov::element::Type attn_mask_prec, ov::element::Type dst_precision, float alibi_slope) { - using func_fp32_type = void (*)(float*, float, const float*, const float*, const uint8_t*, bool, size_t, float, float&); - using func_bf16_type = void 
(*)(float*, float, const float*, const ov::bfloat16*, const uint8_t*, bool, size_t, float, float&); - using func_f16_type = void (*)(float*, float, const float*, const ov::float16*, const uint8_t*, bool, size_t, float, float&); - static constexpr func_fp32_type funcs_fp32[] = { - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max - }; - static constexpr func_bf16_type funcs_bf16[] = { - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max - }; - static constexpr func_f16_type funcs_f16[] = { - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max - }; + using func_fp32_type = + void (*)(float*, float, const float*, const float*, const uint8_t*, bool, size_t, float, float&); + using func_bf16_type = + void (*)(float*, float, const float*, const ov::bfloat16*, const uint8_t*, bool, size_t, float, float&); + using func_f16_type = + void (*)(float*, float, const float*, const ov::float16*, const uint8_t*, bool, size_t, float, float&); + static constexpr func_fp32_type funcs_fp32[] = {scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max}; + static constexpr func_bf16_type funcs_bf16[] = {scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max}; + static constexpr func_f16_type funcs_f16[] = {scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + 
scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max}; int dispatch = (alibi ? 0b100 : 0) | (attn_mask ? 0b010 : 0) | (causal_mask ? 0b001 : 0); float max = std::numeric_limits::lowest(); if (attn_mask_prec == ov::element::f32) { - funcs_fp32[dispatch](a, scale, alibi, static_cast(attn_mask), causal_mask, select_nfltmax_at_0, len, alibi_slope, max); + funcs_fp32[dispatch](a, + scale, + alibi, + static_cast(attn_mask), + causal_mask, + select_nfltmax_at_0, + len, + alibi_slope, + max); } else if (attn_mask_prec == ov::element::bf16) { - funcs_bf16[dispatch](a, scale, alibi, static_cast(attn_mask), causal_mask, select_nfltmax_at_0, len, alibi_slope, max); + funcs_bf16[dispatch](a, + scale, + alibi, + static_cast(attn_mask), + causal_mask, + select_nfltmax_at_0, + len, + alibi_slope, + max); } else { - funcs_f16[dispatch](a, scale, alibi, static_cast(attn_mask), causal_mask, select_nfltmax_at_0, len, alibi_slope, max); + funcs_f16[dispatch](a, + scale, + alibi, + static_cast(attn_mask), + causal_mask, + select_nfltmax_at_0, + len, + alibi_slope, + max); } float sum = 0.0f; @@ -978,47 +1002,89 @@ inline void attn_softmax_kernel(ov::float16* a, ov::element::Type attn_mask_prec, ov::element::Type dst_precision, float alibi_slope) { - using func_fp32_type = void (*)(ov::float16*, float, const ov::float16*, const float*, const uint8_t*, bool, size_t, float, ov::float16&); - using func_bf16_type = void (*)(ov::float16*, float, const ov::float16*, const ov::bfloat16*, const uint8_t*, bool, size_t, float, ov::float16&); - using func_fp16_type = void (*)(ov::float16*, float, const ov::float16*, const ov::float16*, const uint8_t*, bool, size_t, float, ov::float16&); - static constexpr func_fp32_type funcs_fp32[] = { - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - 
scale_add2_reduce_max - }; - static constexpr func_bf16_type funcs_bf16[] = { - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max - }; - static constexpr func_fp16_type funcs_fp16[] = { - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max, - scale_add2_reduce_max - }; + using func_fp32_type = void (*)(ov::float16*, + float, + const ov::float16*, + const float*, + const uint8_t*, + bool, + size_t, + float, + ov::float16&); + using func_bf16_type = void (*)(ov::float16*, + float, + const ov::float16*, + const ov::bfloat16*, + const uint8_t*, + bool, + size_t, + float, + ov::float16&); + using func_fp16_type = void (*)(ov::float16*, + float, + const ov::float16*, + const ov::float16*, + const uint8_t*, + bool, + size_t, + float, + ov::float16&); + static constexpr func_fp32_type funcs_fp32[] = {scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max}; + static constexpr func_bf16_type funcs_bf16[] = {scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max}; + static constexpr func_fp16_type funcs_fp16[] = {scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max, + scale_add2_reduce_max}; int dispatch = (alibi ? 0b100 : 0) | (attn_mask ? 0b010 : 0) | (causal_mask ? 
0b001 : 0); ov::float16 max = std::numeric_limits::lowest(); if (attn_mask_prec == ov::element::f32) { - funcs_fp32[dispatch](a, scale, alibi, static_cast(attn_mask), causal_mask, select_nfltmax_at_0, len, alibi_slope, max); + funcs_fp32[dispatch](a, + scale, + alibi, + static_cast(attn_mask), + causal_mask, + select_nfltmax_at_0, + len, + alibi_slope, + max); } else if (attn_mask_prec == ov::element::f16) { - funcs_fp16[dispatch](a, scale, alibi, static_cast(attn_mask), causal_mask, select_nfltmax_at_0, len, alibi_slope, max); + funcs_fp16[dispatch](a, + scale, + alibi, + static_cast(attn_mask), + causal_mask, + select_nfltmax_at_0, + len, + alibi_slope, + max); } else { - funcs_bf16[dispatch](a, scale, alibi, static_cast(attn_mask), causal_mask, select_nfltmax_at_0, len, alibi_slope, max); + funcs_bf16[dispatch](a, + scale, + alibi, + static_cast(attn_mask), + causal_mask, + select_nfltmax_at_0, + len, + alibi_slope, + max); } ov::float16 sum = 0.0f; diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/transpose_kernel.hpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/transpose_kernel.hpp index b719246e4976a1..93d7db55107951 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/transpose_kernel.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/transpose_kernel.hpp @@ -3,96 +3,108 @@ // #pragma once -#include "common.hpp" -#include "openvino/core/type/element_type.hpp" - #include #include #include #include +#include "common.hpp" +#include "openvino/core/type/element_type.hpp" + namespace ov { namespace Extensions { namespace Cpu { namespace XARCH { #if defined(HAVE_AVX512F) -inline void transpose_m512i_16x16(__m512i& r0, __m512i& r1, __m512i& r2, __m512i& r3, - __m512i& r4, __m512i& r5, __m512i& r6, __m512i& r7, - __m512i& r8, __m512i& r9, __m512i& ra, __m512i& rb, - __m512i& rc, __m512i& rd, __m512i& re, __m512i& rf) { +inline void transpose_m512i_16x16(__m512i& r0, + __m512i& r1, + __m512i& r2, + __m512i& r3, + __m512i& 
r4, + __m512i& r5, + __m512i& r6, + __m512i& r7, + __m512i& r8, + __m512i& r9, + __m512i& ra, + __m512i& rb, + __m512i& rc, + __m512i& rd, + __m512i& re, + __m512i& rf) { __m512i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta, tb, tc, td, te, tf; - t0 = _mm512_unpacklo_epi32(r0, r1); // 0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29 - t1 = _mm512_unpackhi_epi32(r0, r1); // 2 18 3 19 6 22 7 23 10 26 11 27 14 30 15 31 - t2 = _mm512_unpacklo_epi32(r2, r3); // 32 48 33 49 ... - t3 = _mm512_unpackhi_epi32(r2, r3); // 34 50 35 51 ... - t4 = _mm512_unpacklo_epi32(r4, r5); // 64 80 65 81 ... - t5 = _mm512_unpackhi_epi32(r4, r5); // 66 82 67 83 ... - t6 = _mm512_unpacklo_epi32(r6, r7); // 96 112 97 113 ... - t7 = _mm512_unpackhi_epi32(r6, r7); // 98 114 99 115 ... - t8 = _mm512_unpacklo_epi32(r8, r9); // 128 ... - t9 = _mm512_unpackhi_epi32(r8, r9); // 130 ... - ta = _mm512_unpacklo_epi32(ra, rb); // 160 ... - tb = _mm512_unpackhi_epi32(ra, rb); // 162 ... - tc = _mm512_unpacklo_epi32(rc, rd); // 196 ... - td = _mm512_unpackhi_epi32(rc, rd); // 198 ... - te = _mm512_unpacklo_epi32(re, rf); // 228 ... - tf = _mm512_unpackhi_epi32(re, rf); // 230 ... - - r0 = _mm512_unpacklo_epi64(t0, t2); // 0 16 32 48 ... - r1 = _mm512_unpackhi_epi64(t0, t2); // 1 17 33 49 ... - r2 = _mm512_unpacklo_epi64(t1, t3); // 2 18 34 49 ... - r3 = _mm512_unpackhi_epi64(t1, t3); // 3 19 35 51 ... - r4 = _mm512_unpacklo_epi64(t4, t6); // 64 80 96 112 ... - r5 = _mm512_unpackhi_epi64(t4, t6); // 65 81 97 114 ... - r6 = _mm512_unpacklo_epi64(t5, t7); // 66 82 98 113 ... - r7 = _mm512_unpackhi_epi64(t5, t7); // 67 83 99 115 ... - r8 = _mm512_unpacklo_epi64(t8, ta); // 128 144 160 176 ... - r9 = _mm512_unpackhi_epi64(t8, ta); // 129 145 161 178 ... - ra = _mm512_unpacklo_epi64(t9, tb); // 130 146 162 177 ... - rb = _mm512_unpackhi_epi64(t9, tb); // 131 147 163 179 ... - rc = _mm512_unpacklo_epi64(tc, te); // 192 208 228 240 ... - rd = _mm512_unpackhi_epi64(tc, te); // 193 209 229 241 ... 
- re = _mm512_unpacklo_epi64(td, tf); // 194 210 230 242 ... - rf = _mm512_unpackhi_epi64(td, tf); // 195 211 231 243 ... - - t0 = _mm512_shuffle_i32x4(r0, r4, 0x88); // 0 16 32 48 8 24 40 56 64 80 96 112 ... - t1 = _mm512_shuffle_i32x4(r1, r5, 0x88); // 1 17 33 49 ... - t2 = _mm512_shuffle_i32x4(r2, r6, 0x88); // 2 18 34 50 ... - t3 = _mm512_shuffle_i32x4(r3, r7, 0x88); // 3 19 35 51 ... - t4 = _mm512_shuffle_i32x4(r0, r4, 0xdd); // 4 20 36 52 ... - t5 = _mm512_shuffle_i32x4(r1, r5, 0xdd); // 5 21 37 53 ... - t6 = _mm512_shuffle_i32x4(r2, r6, 0xdd); // 6 22 38 54 ... - t7 = _mm512_shuffle_i32x4(r3, r7, 0xdd); // 7 23 39 55 ... - t8 = _mm512_shuffle_i32x4(r8, rc, 0x88); // 128 144 160 176 ... - t9 = _mm512_shuffle_i32x4(r9, rd, 0x88); // 129 145 161 177 ... - ta = _mm512_shuffle_i32x4(ra, re, 0x88); // 130 146 162 178 ... - tb = _mm512_shuffle_i32x4(rb, rf, 0x88); // 131 147 163 179 ... - tc = _mm512_shuffle_i32x4(r8, rc, 0xdd); // 132 148 164 180 ... - td = _mm512_shuffle_i32x4(r9, rd, 0xdd); // 133 149 165 181 ... - te = _mm512_shuffle_i32x4(ra, re, 0xdd); // 134 150 166 182 ... - tf = _mm512_shuffle_i32x4(rb, rf, 0xdd); // 135 151 167 183 ... - - r0 = _mm512_shuffle_i32x4(t0, t8, 0x88); // 0 16 32 48 64 80 96 112 ... 240 - r1 = _mm512_shuffle_i32x4(t1, t9, 0x88); // 1 17 33 49 66 81 97 113 ... 241 - r2 = _mm512_shuffle_i32x4(t2, ta, 0x88); // 2 18 34 50 67 82 98 114 ... 242 - r3 = _mm512_shuffle_i32x4(t3, tb, 0x88); // 3 19 35 51 68 83 99 115 ... 243 - r4 = _mm512_shuffle_i32x4(t4, tc, 0x88); // 4 ... - r5 = _mm512_shuffle_i32x4(t5, td, 0x88); // 5 ... - r6 = _mm512_shuffle_i32x4(t6, te, 0x88); // 6 ... - r7 = _mm512_shuffle_i32x4(t7, tf, 0x88); // 7 ... - r8 = _mm512_shuffle_i32x4(t0, t8, 0xdd); // 8 ... - r9 = _mm512_shuffle_i32x4(t1, t9, 0xdd); // 9 ... - ra = _mm512_shuffle_i32x4(t2, ta, 0xdd); // 10 ... - rb = _mm512_shuffle_i32x4(t3, tb, 0xdd); // 11 ... - rc = _mm512_shuffle_i32x4(t4, tc, 0xdd); // 12 ... 
- rd = _mm512_shuffle_i32x4(t5, td, 0xdd); // 13 ... - re = _mm512_shuffle_i32x4(t6, te, 0xdd); // 14 ... - rf = _mm512_shuffle_i32x4(t7, tf, 0xdd); // 15 31 47 63 79 96 111 127 ... 255 + t0 = _mm512_unpacklo_epi32(r0, r1); // 0 16 1 17 4 20 5 21 8 24 9 25 12 28 13 29 + t1 = _mm512_unpackhi_epi32(r0, r1); // 2 18 3 19 6 22 7 23 10 26 11 27 14 30 15 31 + t2 = _mm512_unpacklo_epi32(r2, r3); // 32 48 33 49 ... + t3 = _mm512_unpackhi_epi32(r2, r3); // 34 50 35 51 ... + t4 = _mm512_unpacklo_epi32(r4, r5); // 64 80 65 81 ... + t5 = _mm512_unpackhi_epi32(r4, r5); // 66 82 67 83 ... + t6 = _mm512_unpacklo_epi32(r6, r7); // 96 112 97 113 ... + t7 = _mm512_unpackhi_epi32(r6, r7); // 98 114 99 115 ... + t8 = _mm512_unpacklo_epi32(r8, r9); // 128 ... + t9 = _mm512_unpackhi_epi32(r8, r9); // 130 ... + ta = _mm512_unpacklo_epi32(ra, rb); // 160 ... + tb = _mm512_unpackhi_epi32(ra, rb); // 162 ... + tc = _mm512_unpacklo_epi32(rc, rd); // 196 ... + td = _mm512_unpackhi_epi32(rc, rd); // 198 ... + te = _mm512_unpacklo_epi32(re, rf); // 228 ... + tf = _mm512_unpackhi_epi32(re, rf); // 230 ... + + r0 = _mm512_unpacklo_epi64(t0, t2); // 0 16 32 48 ... + r1 = _mm512_unpackhi_epi64(t0, t2); // 1 17 33 49 ... + r2 = _mm512_unpacklo_epi64(t1, t3); // 2 18 34 49 ... + r3 = _mm512_unpackhi_epi64(t1, t3); // 3 19 35 51 ... + r4 = _mm512_unpacklo_epi64(t4, t6); // 64 80 96 112 ... + r5 = _mm512_unpackhi_epi64(t4, t6); // 65 81 97 114 ... + r6 = _mm512_unpacklo_epi64(t5, t7); // 66 82 98 113 ... + r7 = _mm512_unpackhi_epi64(t5, t7); // 67 83 99 115 ... + r8 = _mm512_unpacklo_epi64(t8, ta); // 128 144 160 176 ... + r9 = _mm512_unpackhi_epi64(t8, ta); // 129 145 161 178 ... + ra = _mm512_unpacklo_epi64(t9, tb); // 130 146 162 177 ... + rb = _mm512_unpackhi_epi64(t9, tb); // 131 147 163 179 ... + rc = _mm512_unpacklo_epi64(tc, te); // 192 208 228 240 ... + rd = _mm512_unpackhi_epi64(tc, te); // 193 209 229 241 ... + re = _mm512_unpacklo_epi64(td, tf); // 194 210 230 242 ... 
+ rf = _mm512_unpackhi_epi64(td, tf); // 195 211 231 243 ... + + t0 = _mm512_shuffle_i32x4(r0, r4, 0x88); // 0 16 32 48 8 24 40 56 64 80 96 112 ... + t1 = _mm512_shuffle_i32x4(r1, r5, 0x88); // 1 17 33 49 ... + t2 = _mm512_shuffle_i32x4(r2, r6, 0x88); // 2 18 34 50 ... + t3 = _mm512_shuffle_i32x4(r3, r7, 0x88); // 3 19 35 51 ... + t4 = _mm512_shuffle_i32x4(r0, r4, 0xdd); // 4 20 36 52 ... + t5 = _mm512_shuffle_i32x4(r1, r5, 0xdd); // 5 21 37 53 ... + t6 = _mm512_shuffle_i32x4(r2, r6, 0xdd); // 6 22 38 54 ... + t7 = _mm512_shuffle_i32x4(r3, r7, 0xdd); // 7 23 39 55 ... + t8 = _mm512_shuffle_i32x4(r8, rc, 0x88); // 128 144 160 176 ... + t9 = _mm512_shuffle_i32x4(r9, rd, 0x88); // 129 145 161 177 ... + ta = _mm512_shuffle_i32x4(ra, re, 0x88); // 130 146 162 178 ... + tb = _mm512_shuffle_i32x4(rb, rf, 0x88); // 131 147 163 179 ... + tc = _mm512_shuffle_i32x4(r8, rc, 0xdd); // 132 148 164 180 ... + td = _mm512_shuffle_i32x4(r9, rd, 0xdd); // 133 149 165 181 ... + te = _mm512_shuffle_i32x4(ra, re, 0xdd); // 134 150 166 182 ... + tf = _mm512_shuffle_i32x4(rb, rf, 0xdd); // 135 151 167 183 ... + + r0 = _mm512_shuffle_i32x4(t0, t8, 0x88); // 0 16 32 48 64 80 96 112 ... 240 + r1 = _mm512_shuffle_i32x4(t1, t9, 0x88); // 1 17 33 49 66 81 97 113 ... 241 + r2 = _mm512_shuffle_i32x4(t2, ta, 0x88); // 2 18 34 50 67 82 98 114 ... 242 + r3 = _mm512_shuffle_i32x4(t3, tb, 0x88); // 3 19 35 51 68 83 99 115 ... 243 + r4 = _mm512_shuffle_i32x4(t4, tc, 0x88); // 4 ... + r5 = _mm512_shuffle_i32x4(t5, td, 0x88); // 5 ... + r6 = _mm512_shuffle_i32x4(t6, te, 0x88); // 6 ... + r7 = _mm512_shuffle_i32x4(t7, tf, 0x88); // 7 ... + r8 = _mm512_shuffle_i32x4(t0, t8, 0xdd); // 8 ... + r9 = _mm512_shuffle_i32x4(t1, t9, 0xdd); // 9 ... + ra = _mm512_shuffle_i32x4(t2, ta, 0xdd); // 10 ... + rb = _mm512_shuffle_i32x4(t3, tb, 0xdd); // 11 ... + rc = _mm512_shuffle_i32x4(t4, tc, 0xdd); // 12 ... + rd = _mm512_shuffle_i32x4(t5, td, 0xdd); // 13 ... + re = _mm512_shuffle_i32x4(t6, te, 0xdd); // 14 ... 
+ rf = _mm512_shuffle_i32x4(t7, tf, 0xdd); // 15 31 47 63 79 96 111 127 ... 255 } -template +template inline void transpose_16x16_kernel(float* _dst, T* src, size_t dst_stride, size_t src_stride) { auto* dst = reinterpret_cast(_dst); __m512i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf; @@ -133,7 +145,7 @@ inline void transpose_16x16_kernel(float* _dst, T* src, size_t dst_stride, size_ _mm512_storeu_si512(dst + 15 * dst_stride, rf); } -template +template inline void transpose_16xK_kernel(float* _dst, T* src, size_t K, size_t dst_stride, size_t src_stride) { auto* dst = reinterpret_cast(_dst); __m512i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf; @@ -156,24 +168,110 @@ inline void transpose_16xK_kernel(float* _dst, T* src, size_t K, size_t dst_stri transpose_m512i_16x16(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf); -#define S(m) _mm512_storeu_si512(dst + 0x##m * dst_stride, r##m) -#define S8() S(0); S(1); S(2); S(3); S(4); S(5); S(6); S(7); +# define S(m) _mm512_storeu_si512(dst + 0x##m * dst_stride, r##m) +# define S8() \ + S(0); \ + S(1); \ + S(2); \ + S(3); \ + S(4); \ + S(5); \ + S(6); \ + S(7); switch (K) { - case 8: S8(); break; - case 9: S8() S(8); break; - case 10: S8(); S(8); S(9); break; - case 11: S8(); S(8); S(9); S(a); break; - case 12: S8(); S(8); S(9); S(a); S(b); break; - case 13: S8(); S(8); S(9); S(a); S(b); S(c); break; - case 14: S8(); S(8); S(9); S(a); S(b); S(c); S(d); break; - case 15: S8(); S(8); S(9); S(a); S(b); S(c); S(d); S(e); break; - case 1: S(0); break; - case 2: S(0); S(1); break; - case 3: S(0); S(1); S(2); break; - case 4: S(0); S(1); S(2); S(3); break; - case 5: S(0); S(1); S(2); S(3); S(4); break; - case 6: S(0); S(1); S(2); S(3); S(4); S(5); break; - case 7: S(0); S(1); S(2); S(3); S(4); S(5); S(6); break; + case 8: + S8(); + break; + case 9: + S8() S(8); + break; + case 10: + S8(); + S(8); + S(9); + break; + case 11: + S8(); + S(8); + S(9); + S(a); + break; + case 12: + 
S8(); + S(8); + S(9); + S(a); + S(b); + break; + case 13: + S8(); + S(8); + S(9); + S(a); + S(b); + S(c); + break; + case 14: + S8(); + S(8); + S(9); + S(a); + S(b); + S(c); + S(d); + break; + case 15: + S8(); + S(8); + S(9); + S(a); + S(b); + S(c); + S(d); + S(e); + break; + case 1: + S(0); + break; + case 2: + S(0); + S(1); + break; + case 3: + S(0); + S(1); + S(2); + break; + case 4: + S(0); + S(1); + S(2); + S(3); + break; + case 5: + S(0); + S(1); + S(2); + S(3); + S(4); + break; + case 6: + S(0); + S(1); + S(2); + S(3); + S(4); + S(5); + break; + case 7: + S(0); + S(1); + S(2); + S(3); + S(4); + S(5); + S(6); + break; } } @@ -240,30 +338,109 @@ inline void transpose_16xK_kernel(uint32_t* dst, uint32_t* src, size_t K, size_t transpose_m512i_16x16(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ra, rb, rc, rd, re, rf); switch (K) { - case 8: S8(); break; - case 9: S8() S(8); break; - case 10: S8(); S(8); S(9); break; - case 11: S8(); S(8); S(9); S(a); break; - case 12: S8(); S(8); S(9); S(a); S(b); break; - case 13: S8(); S(8); S(9); S(a); S(b); S(c); break; - case 14: S8(); S(8); S(9); S(a); S(b); S(c); S(d); break; - case 15: S8(); S(8); S(9); S(a); S(b); S(c); S(d); S(e); break; - case 1: S(0); break; - case 2: S(0); S(1); break; - case 3: S(0); S(1); S(2); break; - case 4: S(0); S(1); S(2); S(3); break; - case 5: S(0); S(1); S(2); S(3); S(4); break; - case 6: S(0); S(1); S(2); S(3); S(4); S(5); break; - case 7: S(0); S(1); S(2); S(3); S(4); S(5); S(6); break; + case 8: + S8(); + break; + case 9: + S8() S(8); + break; + case 10: + S8(); + S(8); + S(9); + break; + case 11: + S8(); + S(8); + S(9); + S(a); + break; + case 12: + S8(); + S(8); + S(9); + S(a); + S(b); + break; + case 13: + S8(); + S(8); + S(9); + S(a); + S(b); + S(c); + break; + case 14: + S8(); + S(8); + S(9); + S(a); + S(b); + S(c); + S(d); + break; + case 15: + S8(); + S(8); + S(9); + S(a); + S(b); + S(c); + S(d); + S(e); + break; + case 1: + S(0); + break; + case 2: + S(0); + S(1); + break; + case 3: 
+ S(0); + S(1); + S(2); + break; + case 4: + S(0); + S(1); + S(2); + S(3); + break; + case 5: + S(0); + S(1); + S(2); + S(3); + S(4); + break; + case 6: + S(0); + S(1); + S(2); + S(3); + S(4); + S(5); + break; + case 7: + S(0); + S(1); + S(2); + S(3); + S(4); + S(5); + S(6); + break; } -#undef S -#undef S8 +# undef S +# undef S8 } #elif defined(HAVE_AVX2) // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2 -inline void transpose_8x8(__m256& r0, __m256& r1, __m256& r2, __m256& r3, __m256& r4, __m256& r5, __m256& r6, __m256& r7) { +inline void +transpose_8x8(__m256& r0, __m256& r1, __m256& r2, __m256& r3, __m256& r4, __m256& r5, __m256& r6, __m256& r7) { __m256 t0, t1, t2, t3, t4, t5, t6, t7; __m256 tt0, tt1, tt2, tt3, tt4, tt5, tt6, tt7; t0 = _mm256_unpacklo_ps(r0, r1); @@ -292,7 +469,7 @@ inline void transpose_8x8(__m256& r0, __m256& r1, __m256& r2, __m256& r3, __m256 r7 = _mm256_permute2f128_ps(tt3, tt7, 0x31); } -template +template inline void transpose_16x16_kernel(float* dst, T* src, size_t dst_stride, size_t src_stride) { __m256 r0, r1, r2, r3, r4, r5, r6, r7; @@ -323,7 +500,7 @@ inline void transpose_16x16_kernel(float* dst, T* src, size_t dst_stride, size_t } } -template +template inline void transpose_16xK_kernel(float* dst, T* src, size_t K, size_t dst_stride, size_t src_stride) { __m256 r0, r1, r2, r3, r4, r5, r6, r7; @@ -366,24 +543,59 @@ inline void transpose_16xK_kernel(float* dst, T* src, size_t K, size_t dst_strid transpose_8x8(r0, r1, r2, r3, r4, r5, r6, r7); -#define S(m) _mm256_storeu_ps(dst + j + m * dst_stride, r##m) +# define S(m) _mm256_storeu_ps(dst + j + m * dst_stride, r##m) switch (K) { - case 1: S(0); break; - case 2: S(0); S(1); break; - case 3: S(0); S(1); S(2); break; - case 4: S(0); S(1); S(2); S(3); break; - case 5: S(0); S(1); S(2); S(3); S(4); break; - case 6: S(0); S(1); S(2); S(3); S(4); S(5); break; - case 7: S(0); S(1); S(2); S(3); S(4); S(5); S(6); break; + case 1: + S(0); + break; + case 2: + 
S(0); + S(1); + break; + case 3: + S(0); + S(1); + S(2); + break; + case 4: + S(0); + S(1); + S(2); + S(3); + break; + case 5: + S(0); + S(1); + S(2); + S(3); + S(4); + break; + case 6: + S(0); + S(1); + S(2); + S(3); + S(4); + S(5); + break; + case 7: + S(0); + S(1); + S(2); + S(3); + S(4); + S(5); + S(6); + break; } -#undef S +# undef S } } } #else -template +template inline void transpose_16x16_kernel(TDST* dst, TSRC* src, size_t dst_stride, size_t src_stride) { for (size_t i = 0; i < 16; i++) { for (size_t j = 0; j < 16; j++) { @@ -392,7 +604,7 @@ inline void transpose_16x16_kernel(TDST* dst, TSRC* src, size_t dst_stride, size } } -template +template inline void transpose_16xK_kernel(TDST* dst, TSRC* src, size_t K, size_t dst_stride, size_t src_stride) { for (size_t i = 0; i < K; i++) { for (size_t j = 0; j < 16; j++) { diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/brgemm_kernel.cpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/brgemm_kernel.cpp index 2895a272b982b5..7df2e2371a843a 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/brgemm_kernel.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/brgemm_kernel.cpp @@ -4,11 +4,12 @@ #include "brgemm_kernel.hpp" -#include "dnnl_extension_utils.h" -#include "utils/cpu_utils.hpp" #include #include +#include "dnnl_extension_utils.h" +#include "utils/cpu_utils.hpp" + using namespace dnnl::impl::cpu::x64; using namespace dnnl::impl; using namespace dnnl::impl::cpu::x64::matmul; @@ -100,8 +101,9 @@ BrgemmKernel::BrgemmKernel(size_t M, brgemmCtx.M = M_; brgemmCtx.N = N_; brgemmCtx.K = K_; - brgemmCtx.LDA = k ? K_blk : (is_avx_f16_only ? K : lda); // f16 use f32 internally - brgemmCtx.LDB = (!is_f32 || b_transposed) ? rnd_up(N, N_blk) : ldb; // bf16/fp16/b_transposed needs copy + brgemmCtx.LDA = k ? K_blk : (is_avx_f16_only ? K : lda); // f16 use f32 internally + brgemmCtx.LDB = + (!is_f32 || b_transposed) ? 
rnd_up(N, N_blk) : ldb; // bf16/fp16/b_transposed needs copy brgemmCtx.LDC = ldc; brgemmCtx.dt_in0 = static_cast(DnnlExtensionUtils::ElementTypeToDataType(srcType)); brgemmCtx.dt_in1 = static_cast(DnnlExtensionUtils::ElementTypeToDataType(weiType)); @@ -158,8 +160,8 @@ const size_t BrgemmKernel::get_scratch_b_size() const { } void BrgemmKernel::init_brgemm(brgemmCtx& ctx, - std::unique_ptr& brgKernel, - bool use_amx) { + std::unique_ptr& brgKernel, + bool use_amx) { brgemm_desc_t brgDesc; const bool is_int8 = @@ -208,7 +210,8 @@ void BrgemmKernel::init_brgemm(brgemmCtx& ctx, brgattr.max_bs = 1; brgattr.wary_tail_read = false; brgattr.hint_innermost_loop = brgemm_innermost_undef; - // if b_accumulate is true, it means we want c+=a*b. jit_brgemm_amx_uker_base_t::load_accumulators can support this using tileload(c) without postops + // if b_accumulate is true, it means we want c+=a*b. jit_brgemm_amx_uker_base_t::load_accumulators can support + // this using tileload(c) without postops brgattr.use_uker = true; brgattr.use_interleave_stores = true; brgattr.hint_prefetching = brgemm_kernel_prefetching_t::brgemm_prf1; @@ -248,7 +251,7 @@ void BrgemmKernel::init_brgemm_copy_a( brgCopyKernelConf.K_tail = K_tail; brgCopyKernelConf.K_blk = K_blk; brgCopyKernelConf.use_buffer_a_tail_only = false; - //padding K tail to K_blk, LDA is the stride for target tensor + // padding K tail to K_blk, LDA is the stride for target tensor brgCopyKernelConf.LDA = LDA; brgCopyKernelConf.has_zero_point_b = false; brgCopyKernelConf.s8s8_compensation_required = false; @@ -258,9 +261,13 @@ void BrgemmKernel::init_brgemm_copy_a( brgCopyKernelConf.copy_A_src_stride = copy_A_src_stride; // copy_a_kernel assumes that in/out tensor has same data type except f16 // copy_a_kernel has special path for f16: assuming input(f16) -> output(f32) - brgCopyKernelConf.a_dt_sz = is_avx_f16_only ? 
sizeof(ov::float16) : DnnlExtensionUtils::sizeOfDataType(static_cast(dt_in0)); + brgCopyKernelConf.a_dt_sz = is_avx_f16_only + ? sizeof(ov::float16) + : DnnlExtensionUtils::sizeOfDataType(static_cast(dt_in0)); // copied A has the same precision of original - brgCopyKernelConf.tr_a_dt_sz = is_avx_f16_only ? sizeof(float) : DnnlExtensionUtils::sizeOfDataType(static_cast(dt_in0)); + brgCopyKernelConf.tr_a_dt_sz = + is_avx_f16_only ? sizeof(float) + : DnnlExtensionUtils::sizeOfDataType(static_cast(dt_in0)); brgCopyKernelConf.transposed_A = transpose; brgCopyKernelConf.isa = is_avx_f16_only ? avx512_core_fp16 : avx512_core_amx; @@ -284,7 +291,7 @@ void BrgemmKernel::init_brgemm_copy_b( brgCopyKernelConf.wei_dt = is_avx_f16_only ? dnnl_data_type_t::dnnl_f32 : dt_in1; brgCopyKernelConf.orig_wei_dt = dt_in1; brgCopyKernelConf.wei_n_blk = N_blk; - brgCopyKernelConf.wei_tag = transpose ? dnnl_ba : dnnl_ab; + brgCopyKernelConf.wei_tag = transpose ? dnnl_ba : dnnl_ab; brgCopyKernelConf.copy_B_wei_stride = copy_B_wei_stride; brgCopyKernelConf.transposed_B = transpose; @@ -298,10 +305,14 @@ void BrgemmKernel::init_brgemm_copy_b( brgCopyKernelConf.K_tail = 0; brgCopyKernelConf.N_chunk_elems = brgCopyKernelConf.N_blk; // f16 is computed by upconverting. in(f16) -> out(f32) - brgCopyKernelConf.b_dt_sz = is_avx_f16_only ? sizeof(ov::float16) : - DnnlExtensionUtils::sizeOfDataType(static_cast(brgCopyKernelConf.src_dt)); - brgCopyKernelConf.tr_b_dt_sz = is_avx_f16_only ? sizeof(float) : - DnnlExtensionUtils::sizeOfDataType(static_cast(brgCopyKernelConf.src_dt)); + brgCopyKernelConf.b_dt_sz = + is_avx_f16_only + ? sizeof(ov::float16) + : DnnlExtensionUtils::sizeOfDataType(static_cast(brgCopyKernelConf.src_dt)); + brgCopyKernelConf.tr_b_dt_sz = + is_avx_f16_only + ? 
sizeof(float) + : DnnlExtensionUtils::sizeOfDataType(static_cast(brgCopyKernelConf.src_dt)); brgCopyKernelConf.req_wei_vnni_downconvert = false; if (is_with_amx) { @@ -390,12 +401,7 @@ void BrgemmKernel::executeGemm(bool is_M_tail, void* a, void* b, void* c, void* auto weight_ptr = ptr_scartch_b + B_stride; auto C_stride = n * count_N * ov::element::f32.size(); auto out_ptr = ptr_C + C_stride; - callBrgemm(brgemmCtx, - brgKernels[getBrgIdx(mIdx, k, n)], - local_a_ptr, - weight_ptr, - out_ptr, - wsp); + callBrgemm(brgemmCtx, brgKernels[getBrgIdx(mIdx, k, n)], local_a_ptr, weight_ptr, out_ptr, wsp); // stride K, N if body kernel is executed. if (k == 0) { count_K = brgemmCtx.K * brgemmCtx.LDB; diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/dft_uni_kernel.cpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/dft_uni_kernel.cpp index f8b0df611258a7..1d5e81410a0bf3 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/dft_uni_kernel.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/dft_uni_kernel.cpp @@ -4,7 +4,6 @@ #include "dft_uni_kernel.hpp" - using namespace dnnl::impl; using namespace dnnl::impl::utils; using namespace dnnl::impl::cpu::x64; @@ -16,7 +15,8 @@ namespace ov { namespace intel_cpu { template -jit_uni_dft_kernel_f32::jit_uni_dft_kernel_f32() : jit_uni_dft_kernel(), jit_generator(jit_name()) {} +jit_uni_dft_kernel_f32::jit_uni_dft_kernel_f32() : jit_uni_dft_kernel(), + jit_generator(jit_name()) {} template void jit_uni_dft_kernel_f32::create_ker() { @@ -115,11 +115,9 @@ template struct jit_uni_dft_kernel_f32; template struct jit_uni_dft_kernel_f32; template struct jit_uni_dft_kernel_f32; - template -jit_uni_fft_kernel_f32::jit_uni_fft_kernel_f32() - : jit_uni_fft_kernel(), - jit_generator(jit_name()) {} +jit_uni_fft_kernel_f32::jit_uni_fft_kernel_f32() : jit_uni_fft_kernel(), + jit_generator(jit_name()) {} template void jit_uni_fft_kernel_f32::create_ker() { diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/dft_uni_kernel.hpp 
b/src/plugins/intel_cpu/src/nodes/kernels/x64/dft_uni_kernel.hpp index b70c99e5f8a527..095a3db97d2a64 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/dft_uni_kernel.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/dft_uni_kernel.hpp @@ -130,7 +130,6 @@ struct jit_uni_fft_kernel_f32 : public jit_uni_fft_kernel, public dnnl::impl::cp Vmm vmm_data_result = vmm_data_odd_2; - template void loop_process(int step); @@ -138,5 +137,5 @@ struct jit_uni_fft_kernel_f32 : public jit_uni_fft_kernel, public dnnl::impl::cp void move_data(const Xbyak::Xmm& x, const Xbyak::Address& addr, int count); }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/gather_uni_kernel.cpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/gather_uni_kernel.cpp index 5aaefb086f119c..c0de6520b7099c 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/gather_uni_kernel.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/gather_uni_kernel.cpp @@ -3,6 +3,7 @@ // #include "gather_uni_kernel.hpp" + #include "openvino/core/except.hpp" using namespace dnnl::impl::cpu; @@ -10,23 +11,52 @@ using namespace dnnl::impl::cpu; namespace ov { namespace intel_cpu { -const unsigned jitGatherKernelBase::shufMask8bitUni[16] = {0x0C080400, 0x80808080, 0x80808080, 0x80808080, 0x0C080400, 0x80808080, 0x80808080, 0x80808080, - 0x0C080400, 0x80808080, 0x80808080, 0x80808080, 0x0C080400, 0x80808080, 0x80808080, 0x80808080}; -const unsigned jitGatherKernelBase::permMask8bitA2[8] = {0, 4, 1, 5, 2, 6, 3, 7}; -const unsigned jitGatherKernelBase::permMask8bitA5[16] = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}; - -const unsigned jitGatherKernelBase::shufMask16bitUni[16] = {0x05040100, 0x0D0C0908, 0x80808080, 0x80808080, 0x05040100, 0x0D0C0908, 0x80808080, 0x80808080, - 0x05040100, 0x0D0C0908, 0x80808080, 0x80808080, 0x05040100, 0x0D0C0908, 0x80808080, 0x80808080}; -const unsigned 
jitGatherKernelBase::permMask16bitA2[8] = {0, 1, 4, 5, 2, 3, 6, 7}; -const unsigned jitGatherKernelBase::permMask16bitA5[16] = {0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15}; +const unsigned jitGatherKernelBase::shufMask8bitUni[16] = {0x0C080400, + 0x80808080, + 0x80808080, + 0x80808080, + 0x0C080400, + 0x80808080, + 0x80808080, + 0x80808080, + 0x0C080400, + 0x80808080, + 0x80808080, + 0x80808080, + 0x0C080400, + 0x80808080, + 0x80808080, + 0x80808080}; +const unsigned jitGatherKernelBase::permMask8bitA2[8] = {0, 4, 1, 5, 2, 6, 3, 7}; +const unsigned jitGatherKernelBase::permMask8bitA5[16] = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}; + +const unsigned jitGatherKernelBase::shufMask16bitUni[16] = {0x05040100, + 0x0D0C0908, + 0x80808080, + 0x80808080, + 0x05040100, + 0x0D0C0908, + 0x80808080, + 0x80808080, + 0x05040100, + 0x0D0C0908, + 0x80808080, + 0x80808080, + 0x05040100, + 0x0D0C0908, + 0x80808080, + 0x80808080}; +const unsigned jitGatherKernelBase::permMask16bitA2[8] = {0, 1, 4, 5, 2, 3, 6, 7}; +const unsigned jitGatherKernelBase::permMask16bitA5[16] = {0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15}; const unsigned jitGatherKernelBase::incVec[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; #define GET_OFF(field) offsetof(gatherJitExecArgs, field) template -jitUniGatherKernel::jitUniGatherKernel(const jGatherConfParams& jcp) : - jitGatherKernelBase(jcp), x64::jit_generator(jit_name()) { +jitUniGatherKernel::jitUniGatherKernel(const jGatherConfParams& jcp) + : jitGatherKernelBase(jcp), + x64::jit_generator(jit_name()) { vlen = x64::cpu_isa_traits::vlen; dataElPerVec = vlen / jcp.dataTypeSize; idxElPerVec = vlen / indicesTypeSize; @@ -74,7 +104,7 @@ void jitUniGatherKernel::generate() { if (!jcp.dynamicShapes) { mov(regAux1, ptr[regParams + GET_OFF(specIndicesSize)]); uni_vpbroadcastd(vmmSpecIdxSizeB, ptr[regAux1]); - uni_vpslld(vmmSpecIdxSizeB, vmmSpecIdxSizeB, idxTypeShift); // multiply by indices type size. 
+ uni_vpslld(vmmSpecIdxSizeB, vmmSpecIdxSizeB, idxTypeShift); // multiply by indices type size. mov(regAux1, ptr[regParams + GET_OFF(specIdxB)]); uni_vmovups(vmmSpecIdxB, ptr[regAux1]); @@ -84,7 +114,7 @@ void jitUniGatherKernel::generate() { uni_vmovups(vmmSrcBeforeAxisSumB, ptr[regAux1]); } - if (jcp.afterAxisSize == 1lu) { // Elementwise case. + if (jcp.afterAxisSize == 1lu) { // Elementwise case. uni_vmovd(reg32SpecIdxSizeB, xmmSpecIdxSizeB); if (jcp.beforeAxisSize != 1lu) { mov(regAux1, ptr[regParams + GET_OFF(axisAndAfterAxisSizeB)]); @@ -98,8 +128,9 @@ void jitUniGatherKernel::generate() { mov(regBetweenBatchAndAxisSize, ptr[regAux1]); mov(regBetweenBatchAndAxisIter, ptr[regParams + GET_OFF(betweenBatchAndAxisIter)]); - if (jcp.specIdxSize < idxElPerVec) { // Short case. - if (jcp.specIdxSize != 1 && jcp.specIdxSize != 2 && jcp.specIdxSize != 4 && jcp.specIdxSize != 8 && jcp.specIdxSize != 16) { + if (jcp.specIdxSize < idxElPerVec) { // Short case. + if (jcp.specIdxSize != 1 && jcp.specIdxSize != 2 && jcp.specIdxSize != 4 && jcp.specIdxSize != 8 && + jcp.specIdxSize != 16) { mov(regAux1, ptr[regParams + GET_OFF(permIdxMask)]); uni_vmovups(vmmPermIdxMask, ptr[regAux1]); } @@ -107,7 +138,7 @@ void jitUniGatherKernel::generate() { mov(regAux1, ptr[regParams + GET_OFF(beforeAxisDiff)]); uni_vmovups(vmmBeforeAxDiffB, ptr[regAux1]); if (jcp.dataTypeSize != 1) - uni_vpslld(vmmBeforeAxDiffB, vmmBeforeAxDiffB, dataTypeShift); // multiply by data type size + uni_vpslld(vmmBeforeAxDiffB, vmmBeforeAxDiffB, dataTypeShift); // multiply by data type size } if (jcp.batchDims > 0lu) { mov(regAux1, ptr[regParams + GET_OFF(srcAfterBatchSizeB)]); @@ -115,14 +146,14 @@ void jitUniGatherKernel::generate() { } process(true, false); - } else { // Long case. + } else { // Long case. uni_vmovd(reg32IdxIter, xmmSpecIdxB); fillVlenVector(); process(false, false); } - } else { // Blocked case. - if (jcp.afterAxisSize <= idxElPerVec) { // Short case. + } else { // Blocked case. 
+ if (jcp.afterAxisSize <= idxElPerVec) { // Short case. mov(regAux1, ptr[regParams + GET_OFF(afterAxIdxB)]); uni_vmovups(vmmAfterAxisIdxB, ptr[regAux1]); mov(regAux1, ptr[regParams + GET_OFF(afterAxisPermMask)]); @@ -146,18 +177,19 @@ void jitUniGatherKernel::generate() { } const uint64_t specIdxAndAfterAxisSize = jcp.specIdxSize * jcp.afterAxisSize; if (specIdxAndAfterAxisSize != 1 && specIdxAndAfterAxisSize != 2 && specIdxAndAfterAxisSize != 4 && - specIdxAndAfterAxisSize != 8 && specIdxAndAfterAxisSize != 16) { + specIdxAndAfterAxisSize != 8 && specIdxAndAfterAxisSize != 16) { mov(regAux1, ptr[regParams + GET_OFF(beforeAxisPermMask)]); uni_vmovups(vmmBeforeAxPermMask, ptr[regAux1]); } } process(true, true); - } else { // Long case. - OPENVINO_THROW("Gather kernel does not support static shape with after axis size greater than elements in vector."); + } else { // Long case. + OPENVINO_THROW("Gather kernel does not support static shape with after axis size greater than elements " + "in vector."); } } - } else { // Dynamic shapes. + } else { // Dynamic shapes. mov(regAux1, ptr[regParams + GET_OFF(start)]); uni_vpbroadcastd(vmmSpecIdxB, ptr[regAux1]); mov(regAux1, reinterpret_cast(incVec)); @@ -172,8 +204,8 @@ void jitUniGatherKernel::generate() { uni_vroundps(vmmSrcBeforeAxisSumB, vmmSrcBeforeAxisSumB, 0x1); uni_vfnmadd231ps(vmmSpecIdxB, vmmSrcBeforeAxisSumB, vAux1); uni_vcvtps2dq(vmmSpecIdxB, vmmSpecIdxB); - uni_vpslld(vmmSpecIdxB, vmmSpecIdxB, idxTypeShift); // multiply by indices type size. - uni_vpslld(vmmSpecIdxSizeB, vmmSpecIdxSizeB, idxTypeShift); // multiply by indices type size. + uni_vpslld(vmmSpecIdxB, vmmSpecIdxB, idxTypeShift); // multiply by indices type size. + uni_vpslld(vmmSpecIdxSizeB, vmmSpecIdxSizeB, idxTypeShift); // multiply by indices type size. 
uni_vmovd(reg32SpecIdxSizeB, xmmSpecIdxSizeB); mov(regAux1, ptr[regParams + GET_OFF(betweenBatchAndAxisSize)]); @@ -189,7 +221,8 @@ void jitUniGatherKernel::generate() { mov(regAux1, ptr[regParams + GET_OFF(axisAndAfterAxisSizeB)]); uni_vpbroadcastd(vmmAxisAndAfterAxisSizeB, ptr[regAux1]); - // Formula: srcBeforeAxisSum = ((start / specIndicesSize) % betweenBatchAndAxis) * axisAndAfterAxisSize + srcAfterBatchSize * idxBatchSum + // Formula: srcBeforeAxisSum = ((start / specIndicesSize) % betweenBatchAndAxis) * axisAndAfterAxisSize + + // srcAfterBatchSize * idxBatchSum if (jcp.beforeAxisSize != 1lu) { uni_vpmulld(vmmSrcBeforeAxisSumB, vmmSrcBeforeAxisSumB, vmmAxisAndAfterAxisSizeB); mov(regAux1, ptr[regParams + GET_OFF(srcAfterBatchSizeB)]); @@ -210,28 +243,29 @@ void jitUniGatherKernel::generate() { cmp(regSpecIdxSizeB, vlen); jl(lLessThanVector1, T_NEAR); - uni_vmovd(reg32IdxIter, xmmSpecIdxB); - fillVlenVector(); + uni_vmovd(reg32IdxIter, xmmSpecIdxB); + fillVlenVector(); - process(false, false); - jmp(lE1, T_NEAR); + process(false, false); + jmp(lE1, T_NEAR); L(lLessThanVector1); - mov(regAux1, ptr[regParams + GET_OFF(permIdxMask)]); - uni_vmovups(vmmPermIdxMask, ptr[regAux1]); - if (jcp.beforeAxisSize != 1lu) { - mov(regAux1, ptr[regParams + GET_OFF(beforeAxisDiff)]); - uni_vmovups(vmmBeforeAxDiffB, ptr[regAux1]); - if (jcp.dataTypeSize != 1) - uni_vpslld(vmmBeforeAxDiffB, vmmBeforeAxDiffB, dataTypeShift); // multiply by data type size - } - mov(regAux1, ptr[regParams + GET_OFF(srcAfterBatchSizeB)]); - uni_vpbroadcastd(vmmSrcAfterBatchSizeB, ptr[regAux1]); + mov(regAux1, ptr[regParams + GET_OFF(permIdxMask)]); + uni_vmovups(vmmPermIdxMask, ptr[regAux1]); + if (jcp.beforeAxisSize != 1lu) { + mov(regAux1, ptr[regParams + GET_OFF(beforeAxisDiff)]); + uni_vmovups(vmmBeforeAxDiffB, ptr[regAux1]); + if (jcp.dataTypeSize != 1) + uni_vpslld(vmmBeforeAxDiffB, vmmBeforeAxDiffB, dataTypeShift); // multiply by data type size + } + mov(regAux1, ptr[regParams + 
GET_OFF(srcAfterBatchSizeB)]); + uni_vpbroadcastd(vmmSrcAfterBatchSizeB, ptr[regAux1]); - process(true, false); + process(true, false); L(lE1); jmp(lEnd, T_NEAR); } - L(lBlock); { + L(lBlock); + { mov(regAux1, ptr[regParams + GET_OFF(start)]); uni_vpbroadcastd(vmmAfterAxisIdxB, ptr[regAux1]); mov(regAux1, reinterpret_cast(incVec)); @@ -246,40 +280,40 @@ void jitUniGatherKernel::generate() { uni_vroundps(vmmSrcBeforeAxisSumB, vmmSrcBeforeAxisSumB, 0x1); uni_vfnmadd231ps(vmmAfterAxisIdxB, vmmSrcBeforeAxisSumB, vAux1); uni_vcvtps2dq(vmmAfterAxisIdxB, vmmAfterAxisIdxB); - uni_vpslld(vmmAfterAxisIdxB, vmmAfterAxisIdxB, idxTypeShift); // multiply by indices type size. + uni_vpslld(vmmAfterAxisIdxB, vmmAfterAxisIdxB, idxTypeShift); // multiply by indices type size. Xbyak::Label lLessThanVector2, lTail3, lTail4, lE2; cmp(regAux2, dataElPerVec); jl(lLessThanVector2, T_NEAR); - uni_vmovd(reg32IdxIter, xmmSpecIdxB); - fillVlenVector(); + uni_vmovd(reg32IdxIter, xmmSpecIdxB); + fillVlenVector(); -// process(false, true); - jmp(lE2, T_NEAR); + // process(false, true); + jmp(lE2, T_NEAR); L(lLessThanVector2); - auto& vAux2 = vmmAuxContainer[2]; - // Calculate permute mask - uni_vmovd(xAux0, reg32Aux2); - uni_vpbroadcastd(vAux1, xAux0); - mov(regAux1, reinterpret_cast(&idxElPerVec)); - uni_vpbroadcastd(vAux0, ptr[regAux1]); - uni_vpsubd(vmmAfterAxisPermMask, vAux0, vAux1); - mov(regAux1, reinterpret_cast(incVec)); - uni_vpaddd(vmmAfterAxisPermMask, vmmAfterAxisPermMask, ptr[regAux1]); - for (int i = 0; i < 6; i++) { - if (isa == x64::avx512_core) { - Xbyak::Opmask kMask2 = Xbyak::Opmask(vAux2.getIdx()); - vpcmpgtd(kMask2, vAux0, vmmAfterAxisPermMask); - uni_vpsubd(vmmAfterAxisPermMask | kMask2, vmmAfterAxisPermMask, vAux1); - } else { - vpcmpgtd(vAux2, vAux0, vmmAfterAxisPermMask); - vpandn(vAux2, vAux2, vAux1); - uni_vpsubd(vmmAfterAxisPermMask, vmmAfterAxisPermMask, vAux2); - } + auto& vAux2 = vmmAuxContainer[2]; + // Calculate permute mask + uni_vmovd(xAux0, reg32Aux2); + 
uni_vpbroadcastd(vAux1, xAux0); + mov(regAux1, reinterpret_cast(&idxElPerVec)); + uni_vpbroadcastd(vAux0, ptr[regAux1]); + uni_vpsubd(vmmAfterAxisPermMask, vAux0, vAux1); + mov(regAux1, reinterpret_cast(incVec)); + uni_vpaddd(vmmAfterAxisPermMask, vmmAfterAxisPermMask, ptr[regAux1]); + for (int i = 0; i < 6; i++) { + if (isa == x64::avx512_core) { + Xbyak::Opmask kMask2 = Xbyak::Opmask(vAux2.getIdx()); + vpcmpgtd(kMask2, vAux0, vmmAfterAxisPermMask); + uni_vpsubd(vmmAfterAxisPermMask | kMask2, vmmAfterAxisPermMask, vAux1); + } else { + vpcmpgtd(vAux2, vAux0, vmmAfterAxisPermMask); + vpandn(vAux2, vAux2, vAux1); + uni_vpsubd(vmmAfterAxisPermMask, vmmAfterAxisPermMask, vAux2); } + } - process(true, true); + process(true, true); L(lE2); } L(lEnd); @@ -323,7 +357,7 @@ void jitUniGatherKernel::normalizeRawIndices(Vmm& vRawIndices, } // Check boundaries. vpcmpgtd(kAuxMask, vmmAxisDim, vRawIndices); - vpcmpd(kDstMask | kAuxMask, vmmZeros, vRawIndices, 2); // 2 - LE + vpcmpd(kDstMask | kAuxMask, vmmZeros, vRawIndices, 2); // 2 - LE // Multiply by type size. 
if (jcp.dataTypeSize > 1) uni_vpslld(vRawIndices, vRawIndices, dataTypeShift); @@ -338,7 +372,7 @@ void jitUniGatherKernel::normWithUpperBound(Vmm& vTarget, Vmm& vMax, template <> void jitUniGatherKernel::normWithUpperBound(Vmm& vTarget, Vmm& vMax, Vmask& kAuxMask) { - vpcmpd(kAuxMask, vMax, vTarget, 2); // 2 -> LE + vpcmpd(kAuxMask, vMax, vTarget, 2); // 2 -> LE uni_vpsubd(vTarget | kAuxMask, vTarget, vMax); } @@ -359,77 +393,77 @@ void jitUniGatherKernel::calcSrcShiftLong(Vmm* vAuxPool, bool shiftFi add(regIdxIter, vlen); cmp(regIdxIter, regSpecIdxSizeB); jge(lIdxStride, T_NEAR); + if (jcp.batchDims > 0lu) { + uni_vpaddd(vDstShifts, vmmIdxBatchSumB, vmmSpecIdxB); + uni_vmovd(reg32Aux1, xmmAuxContainer[vDstShifts.getIdx()]); + } else { + uni_vmovd(reg32Aux1, xmmSpecIdxB); + } + vmovdqu(vDstShifts, ptr[regIndices + regAux1]); + normalizeRawIndices(vDstShifts, kDstMask, kAuxMask0); + if (jcp.beforeAxisSize != 1lu) + uni_vpaddd(vDstShifts, vDstShifts, vmmSrcBeforeAxisSumB); + jmp(lExit, T_NEAR); + L(lIdxStride); + sub(regIdxIter, regSpecIdxSizeB); + vpcmpeqd(kDstMask, vAux0, vAux0); + if (shiftFirst) { + vpcmpgtd(vAux0, vmmSpecIdxSizeB, vmmSpecIdxB); + vpandn(vAux1, vAux0, vmmSpecIdxSizeB); + uni_vpsubd(vAux1, vmmSpecIdxB, vAux1); + if (jcp.batchDims > 0lu) + uni_vpaddd(vAux1, vmmIdxBatchSumB, vAux1); + uni_vpsubd(vmmSpecIdxB, vmmSpecIdxB, vmmSpecIdxSizeB); + } else { if (jcp.batchDims > 0lu) { - uni_vpaddd(vDstShifts, vmmIdxBatchSumB, vmmSpecIdxB); - uni_vmovd(reg32Aux1, xmmAuxContainer[vDstShifts.getIdx()]); + uni_vpaddd(vAux0, vmmIdxBatchSumB, vmmSpecIdxB); + uniVpGatherDd(vDstShifts, ptr[regIndices + vAux0], kDstMask); } else { - uni_vmovd(reg32Aux1, xmmSpecIdxB); + uniVpGatherDd(vDstShifts, ptr[regIndices + vmmSpecIdxB], kDstMask); } - vmovdqu(vDstShifts, ptr[regIndices + regAux1]); normalizeRawIndices(vDstShifts, kDstMask, kAuxMask0); - if (jcp.beforeAxisSize != 1lu) - uni_vpaddd(vDstShifts, vDstShifts, vmmSrcBeforeAxisSumB); - jmp(lExit, T_NEAR); - 
L(lIdxStride); - sub(regIdxIter, regSpecIdxSizeB); - vpcmpeqd(kDstMask, vAux0, vAux0); - if (shiftFirst) { - vpcmpgtd(vAux0, vmmSpecIdxSizeB, vmmSpecIdxB); - vpandn(vAux1, vAux0, vmmSpecIdxSizeB); - uni_vpsubd(vAux1, vmmSpecIdxB, vAux1); - if (jcp.batchDims > 0lu) - uni_vpaddd(vAux1, vmmIdxBatchSumB, vAux1); - uni_vpsubd(vmmSpecIdxB, vmmSpecIdxB, vmmSpecIdxSizeB); - } else { - if (jcp.batchDims > 0lu) { - uni_vpaddd(vAux0, vmmIdxBatchSumB, vmmSpecIdxB); - uniVpGatherDd(vDstShifts, ptr[regIndices + vAux0], kDstMask); - } else { - uniVpGatherDd(vDstShifts, ptr[regIndices + vmmSpecIdxB], kDstMask); - } - normalizeRawIndices(vDstShifts, kDstMask, kAuxMask0); - uni_vpbroadcastd(vAux0, xmmSpecIdxB); - vpcmpgtd(vAux1, vAux0, vmmSpecIdxB); - vpandn(vAux0, vAux1, vmmSpecIdxSizeB); - uni_vpsubd(vmmSpecIdxB, vmmSpecIdxB, vAux0); + uni_vpbroadcastd(vAux0, xmmSpecIdxB); + vpcmpgtd(vAux1, vAux0, vmmSpecIdxB); + vpandn(vAux0, vAux1, vmmSpecIdxSizeB); + uni_vpsubd(vmmSpecIdxB, vmmSpecIdxB, vAux0); - if (jcp.beforeAxisSize != 1lu) { - uni_vpaddd(vDstShifts, vDstShifts, vmmSrcBeforeAxisSumB); - vpandn(vAux0, vAux1, vmmAxisAndAfterAxisSizeB); - uni_vpaddd(vmmSrcBeforeAxisSumB, vmmSrcBeforeAxisSumB, vAux0); - } + if (jcp.beforeAxisSize != 1lu) { + uni_vpaddd(vDstShifts, vDstShifts, vmmSrcBeforeAxisSumB); + vpandn(vAux0, vAux1, vmmAxisAndAfterAxisSizeB); + uni_vpaddd(vmmSrcBeforeAxisSumB, vmmSrcBeforeAxisSumB, vAux0); } + } - if (jcp.batchDims > 0lu) { - Xbyak::Label l1; - inc(regBetweenBatchAndAxisIter); - cmp(regBetweenBatchAndAxisIter, regBetweenBatchAndAxisSize); - jl(l1, T_NEAR); - mov(regBetweenBatchAndAxisIter, 0); - if (shiftFirst) { - uni_vpaddd(vmmIdxBatchSumB, vmmIdxBatchSumB, vmmSpecIdxSizeB); - vpandn(vDstShifts, vAux0, vmmSpecIdxSizeB); - uni_vpaddd(vAux1, vAux1, vDstShifts); - } else { - vpandn(vAux0, vAux1, vmmSpecIdxSizeB); - uni_vpaddd(vmmIdxBatchSumB, vmmIdxBatchSumB, vAux0); - } - L(l1); + if (jcp.batchDims > 0lu) { + Xbyak::Label l1; + 
inc(regBetweenBatchAndAxisIter); + cmp(regBetweenBatchAndAxisIter, regBetweenBatchAndAxisSize); + jl(l1, T_NEAR); + mov(regBetweenBatchAndAxisIter, 0); + if (shiftFirst) { + uni_vpaddd(vmmIdxBatchSumB, vmmIdxBatchSumB, vmmSpecIdxSizeB); + vpandn(vDstShifts, vAux0, vmmSpecIdxSizeB); + uni_vpaddd(vAux1, vAux1, vDstShifts); + } else { + vpandn(vAux0, vAux1, vmmSpecIdxSizeB); + uni_vpaddd(vmmIdxBatchSumB, vmmIdxBatchSumB, vAux0); } + L(l1); + } - if (shiftFirst) { - uniVpGatherDd(vDstShifts, ptr[regIndices + vAux1], kDstMask); - normalizeRawIndices(vDstShifts, kDstMask, kAuxMask0); + if (shiftFirst) { + uniVpGatherDd(vDstShifts, ptr[regIndices + vAux1], kDstMask); + normalizeRawIndices(vDstShifts, kDstMask, kAuxMask0); - if (jcp.beforeAxisSize != 1lu) { - vpandn(vAux0, vAux0, vmmAxisAndAfterAxisSizeB); - uni_vpaddd(vAux0, vAux0, vmmSrcBeforeAxisSumB); - uni_vpaddd(vmmSrcBeforeAxisSumB, vmmSrcBeforeAxisSumB, vmmAxisAndAfterAxisSizeB); + if (jcp.beforeAxisSize != 1lu) { + vpandn(vAux0, vAux0, vmmAxisAndAfterAxisSizeB); + uni_vpaddd(vAux0, vAux0, vmmSrcBeforeAxisSumB); + uni_vpaddd(vmmSrcBeforeAxisSumB, vmmSrcBeforeAxisSumB, vmmAxisAndAfterAxisSizeB); - uni_vpaddd(vDstShifts, vDstShifts, vAux0); - } + uni_vpaddd(vDstShifts, vDstShifts, vAux0); } + } L(lExit); } @@ -451,81 +485,81 @@ void jitUniGatherKernel::calcSrcShiftLong(Vmm* vAuxPool, bool add(regIdxIter, vlen); cmp(regIdxIter, regSpecIdxSizeB); jge(lIdxStride, T_NEAR); + if (jcp.batchDims > 0lu) { + uni_vpaddd(vDstShifts, vmmIdxBatchSumB, vmmSpecIdxB); + uni_vmovd(reg32Aux1, xmmAuxContainer[vDstShifts.getIdx()]); + } else { + uni_vmovd(reg32Aux1, xmmSpecIdxB); + } + vmovdqu64(vDstShifts, ptr[regIndices + regAux1]); + normalizeRawIndices(vDstShifts, kDstMask, kAuxMask0); + if (jcp.beforeAxisSize != 1lu) + uni_vpaddd(vDstShifts, vDstShifts, vmmSrcBeforeAxisSumB); + jmp(lExit, T_NEAR); + L(lIdxStride); + sub(regIdxIter, regSpecIdxSizeB); + vpcmpeqd(kDstMask, vDstShifts, vDstShifts); + if (shiftFirst) { + 
vpcmpd(kAuxMask1, vmmSpecIdxSizeB, vmmSpecIdxB, 2); // 2 -> LE if (jcp.batchDims > 0lu) { - uni_vpaddd(vDstShifts, vmmIdxBatchSumB, vmmSpecIdxB); - uni_vmovd(reg32Aux1, xmmAuxContainer[vDstShifts.getIdx()]); + uni_vpaddd(vAux1, vmmIdxBatchSumB, vmmSpecIdxB); + uni_vpsubd(vAux1 | kAuxMask1, vAux1, vmmSpecIdxSizeB); } else { - uni_vmovd(reg32Aux1, xmmSpecIdxB); + uni_vmovups(vAux1, vmmSpecIdxB); + uni_vpsubd(vAux1 | kAuxMask1, vmmSpecIdxB, vmmSpecIdxSizeB); } - vmovdqu64(vDstShifts, ptr[regIndices + regAux1]); - normalizeRawIndices(vDstShifts, kDstMask, kAuxMask0); - if (jcp.beforeAxisSize != 1lu) - uni_vpaddd(vDstShifts, vDstShifts, vmmSrcBeforeAxisSumB); - jmp(lExit, T_NEAR); - L(lIdxStride); - sub(regIdxIter, regSpecIdxSizeB); - vpcmpeqd(kDstMask, vDstShifts, vDstShifts); - if (shiftFirst) { - vpcmpd(kAuxMask1, vmmSpecIdxSizeB, vmmSpecIdxB, 2); // 2 -> LE - if (jcp.batchDims > 0lu) { - uni_vpaddd(vAux1, vmmIdxBatchSumB, vmmSpecIdxB); - uni_vpsubd(vAux1 | kAuxMask1, vAux1, vmmSpecIdxSizeB); - } else { - uni_vmovups(vAux1, vmmSpecIdxB); - uni_vpsubd(vAux1 | kAuxMask1, vmmSpecIdxB, vmmSpecIdxSizeB); - } - uni_vpsubd(vmmSpecIdxB, vmmSpecIdxB, vmmSpecIdxSizeB); + uni_vpsubd(vmmSpecIdxB, vmmSpecIdxB, vmmSpecIdxSizeB); + } else { + if (jcp.batchDims > 0lu) { + uni_vpaddd(vAux0, vmmIdxBatchSumB, vmmSpecIdxB); + uniVpGatherDd(vDstShifts, ptr[regIndices + vAux0], kDstMask); } else { - if (jcp.batchDims > 0lu) { - uni_vpaddd(vAux0, vmmIdxBatchSumB, vmmSpecIdxB); - uniVpGatherDd(vDstShifts, ptr[regIndices + vAux0], kDstMask); - } else { - uniVpGatherDd(vDstShifts, ptr[regIndices + vmmSpecIdxB], kDstMask); - } - normalizeRawIndices(vDstShifts, kDstMask, kAuxMask0); + uniVpGatherDd(vDstShifts, ptr[regIndices + vmmSpecIdxB], kDstMask); + } + normalizeRawIndices(vDstShifts, kDstMask, kAuxMask0); - uni_vpbroadcastd(vAux0, xmmSpecIdxB); - vpcmpd(kAuxMask1, vAux0, vmmSpecIdxB, 2); // 2 -> LE - uni_vpsubd(vmmSpecIdxB | kAuxMask1, vmmSpecIdxB, vmmSpecIdxSizeB); + 
uni_vpbroadcastd(vAux0, xmmSpecIdxB); + vpcmpd(kAuxMask1, vAux0, vmmSpecIdxB, 2); // 2 -> LE + uni_vpsubd(vmmSpecIdxB | kAuxMask1, vmmSpecIdxB, vmmSpecIdxSizeB); - if (jcp.beforeAxisSize != 1lu) { - uni_vpaddd(vDstShifts, vDstShifts, vmmSrcBeforeAxisSumB); - uni_vpaddd(vmmSrcBeforeAxisSumB | kAuxMask1, vmmSrcBeforeAxisSumB, vmmAxisAndAfterAxisSizeB); - } + if (jcp.beforeAxisSize != 1lu) { + uni_vpaddd(vDstShifts, vDstShifts, vmmSrcBeforeAxisSumB); + uni_vpaddd(vmmSrcBeforeAxisSumB | kAuxMask1, vmmSrcBeforeAxisSumB, vmmAxisAndAfterAxisSizeB); } + } - if (jcp.batchDims > 0lu) { - Xbyak::Label l1; - inc(regBetweenBatchAndAxisIter); - cmp(regBetweenBatchAndAxisIter, regBetweenBatchAndAxisSize); - jl(l1, T_NEAR); - mov(regBetweenBatchAndAxisIter, 0); - if (shiftFirst) { - uni_vpaddd(vmmIdxBatchSumB, vmmIdxBatchSumB, vmmSpecIdxSizeB); - uni_vpaddd(vAux1 | kAuxMask1, vAux1, vmmSpecIdxSizeB); - } else { - uni_vpaddd(vmmIdxBatchSumB | kAuxMask1, vmmIdxBatchSumB, vmmSpecIdxSizeB); - } - L(l1); + if (jcp.batchDims > 0lu) { + Xbyak::Label l1; + inc(regBetweenBatchAndAxisIter); + cmp(regBetweenBatchAndAxisIter, regBetweenBatchAndAxisSize); + jl(l1, T_NEAR); + mov(regBetweenBatchAndAxisIter, 0); + if (shiftFirst) { + uni_vpaddd(vmmIdxBatchSumB, vmmIdxBatchSumB, vmmSpecIdxSizeB); + uni_vpaddd(vAux1 | kAuxMask1, vAux1, vmmSpecIdxSizeB); + } else { + uni_vpaddd(vmmIdxBatchSumB | kAuxMask1, vmmIdxBatchSumB, vmmSpecIdxSizeB); } + L(l1); + } - if (shiftFirst) { - uniVpGatherDd(vDstShifts, ptr[regIndices + vAux1], kDstMask); - normalizeRawIndices(vDstShifts, kDstMask, kAuxMask0); + if (shiftFirst) { + uniVpGatherDd(vDstShifts, ptr[regIndices + vAux1], kDstMask); + normalizeRawIndices(vDstShifts, kDstMask, kAuxMask0); - if (jcp.beforeAxisSize != 1lu) { - uni_vpaddd(vDstShifts, vDstShifts, vmmSrcBeforeAxisSumB); - uni_vpaddd(vDstShifts | kAuxMask1, vDstShifts, vmmAxisAndAfterAxisSizeB); - uni_vpaddd(vmmSrcBeforeAxisSumB, vmmSrcBeforeAxisSumB, vmmAxisAndAfterAxisSizeB); - } + if 
(jcp.beforeAxisSize != 1lu) { + uni_vpaddd(vDstShifts, vDstShifts, vmmSrcBeforeAxisSumB); + uni_vpaddd(vDstShifts | kAuxMask1, vDstShifts, vmmAxisAndAfterAxisSizeB); + uni_vpaddd(vmmSrcBeforeAxisSumB, vmmSrcBeforeAxisSumB, vmmAxisAndAfterAxisSizeB); } + } L(lExit); } template void jitUniGatherKernel::calcSrcShiftLongBlock(Vmm* vAuxPool, bool shiftFirst) { - // Most likely there will no significant performance gain vs memcpy in reference implementation on big blocks after axis, - // therefore no time was invested to this case yet. + // Most likely there will no significant performance gain vs memcpy in reference implementation on big blocks after + // axis, therefore no time was invested to this case yet. OPENVINO_THROW("Unsupported case."); } @@ -541,7 +575,8 @@ void jitUniGatherKernel::calcSrcShiftShort(Vmm* vAuxPool, bool shiftFirst) if (jcp.beforeAxisSize != 1lu) uni_vpaddd(vmmSrcBeforeAxisSumB, vmmSrcBeforeAxisSumB, vmmBeforeAxDiffB); // No sense to permute if specIdxSize is one of {1, 2, 4, 8, 16}. 0 is reserved for dynamic case. - if (jcp.specIdxSize != 1 && jcp.specIdxSize != 2 && jcp.specIdxSize != 4 && jcp.specIdxSize != 8 && jcp.specIdxSize != 16) { + if (jcp.specIdxSize != 1 && jcp.specIdxSize != 2 && jcp.specIdxSize != 4 && jcp.specIdxSize != 8 && + jcp.specIdxSize != 16) { vpermd(vmmSpecIdxB, vmmPermIdxMask, vmmSpecIdxB); if (jcp.beforeAxisSize != 1lu) vpermd(vmmBeforeAxDiffB, vmmPermIdxMask, vmmBeforeAxDiffB); @@ -588,7 +623,8 @@ void jitUniGatherKernel::calcSrcShiftShortBlock(Vmm* vAuxPool, bool shiftFi normWithUpperBound(vmmSpecIdxB, vmmSpecIdxSizeB, kAuxMask0); } // No sense to permute if afterAxisSize is one of {1, 2, 4, 8, 16}. 0 is reserved for dynamic case. 
- if (jcp.afterAxisSize != 1 && jcp.afterAxisSize != 2 && jcp.afterAxisSize != 4 && jcp.afterAxisSize != 8 && jcp.afterAxisSize != 16) { + if (jcp.afterAxisSize != 1 && jcp.afterAxisSize != 2 && jcp.afterAxisSize != 4 && jcp.afterAxisSize != 8 && + jcp.afterAxisSize != 16) { vpermd(vmmAfterAxisIdxB, vmmAfterAxisPermMask, vmmAfterAxisIdxB); if (jcp.specIdxSize != 1) vpermd(vmmSpecIdxDiff, vmmAfterAxisPermMask, vmmSpecIdxDiff); @@ -600,33 +636,33 @@ void jitUniGatherKernel::calcSrcShiftShortBlock(Vmm* vAuxPool, bool shiftFi uni_vpaddd(vmmSrcBeforeAxisSumB, vmmSrcBeforeAxisSumB, vmmBeforeAxDiffB); uni_vmovups(vAux1, vmmSrcBeforeAxisSumB); if (specIdxAndAfterAxisSize != 1 && specIdxAndAfterAxisSize != 2 && specIdxAndAfterAxisSize != 4 && - specIdxAndAfterAxisSize != 8 && specIdxAndAfterAxisSize != 16) + specIdxAndAfterAxisSize != 8 && specIdxAndAfterAxisSize != 16) vpermd(vmmBeforeAxDiffB, vmmBeforeAxPermMask, vmmBeforeAxDiffB); } else { Xbyak::Label lBeforeAxStep, lBeforeAxStepEnd; add(rSpecIdxAndAfterAxIterB, idxElPerVec * jcp.dataTypeSize); cmp(rSpecIdxAndAfterAxIterB, rSpecIdxAndAfterAxSizeB); jl(lBeforeAxStep, T_NEAR); - sub(rSpecIdxAndAfterAxIterB, rSpecIdxAndAfterAxSizeB); - - vpmulld(vAux0, vmmSpecIdxB, vmmAfterAxisSize); - uni_vpaddd(vAux0, vAux0, vmmAfterAxisIdxB); - Xbyak::Xmm& xAux0 = xmmAuxContainer[vAux0.getIdx()]; - uni_vpbroadcastd(vAux1, xAux0); - if (isa == x64::avx512_core) { - Xbyak::Opmask kMask0 = Xbyak::Opmask(kAuxMask0.getIdx()); - vpcmpgtd(kMask0, vAux1, vAux0); - uni_vmovups(vAux1, vmmSrcBeforeAxisSumB); - uni_vpaddd(vAux1 | kMask0, vmmSrcBeforeAxisSumB, vmmAxisAndAfterAxisSizeB); - } else { - vpcmpgtd(vAux1, vAux1, vAux0); - vpand(vAux1, vAux1, vmmAxisAndAfterAxisSizeB); - uni_vpaddd(vAux1, vmmSrcBeforeAxisSumB, vAux1); - } - uni_vpaddd(vmmSrcBeforeAxisSumB, vmmSrcBeforeAxisSumB, vmmAxisAndAfterAxisSizeB); - jmp(lBeforeAxStepEnd); - L(lBeforeAxStep); + sub(rSpecIdxAndAfterAxIterB, rSpecIdxAndAfterAxSizeB); + + vpmulld(vAux0, vmmSpecIdxB, 
vmmAfterAxisSize); + uni_vpaddd(vAux0, vAux0, vmmAfterAxisIdxB); + Xbyak::Xmm& xAux0 = xmmAuxContainer[vAux0.getIdx()]; + uni_vpbroadcastd(vAux1, xAux0); + if (isa == x64::avx512_core) { + Xbyak::Opmask kMask0 = Xbyak::Opmask(kAuxMask0.getIdx()); + vpcmpgtd(kMask0, vAux1, vAux0); uni_vmovups(vAux1, vmmSrcBeforeAxisSumB); + uni_vpaddd(vAux1 | kMask0, vmmSrcBeforeAxisSumB, vmmAxisAndAfterAxisSizeB); + } else { + vpcmpgtd(vAux1, vAux1, vAux0); + vpand(vAux1, vAux1, vmmAxisAndAfterAxisSizeB); + uni_vpaddd(vAux1, vmmSrcBeforeAxisSumB, vAux1); + } + uni_vpaddd(vmmSrcBeforeAxisSumB, vmmSrcBeforeAxisSumB, vmmAxisAndAfterAxisSizeB); + jmp(lBeforeAxStepEnd); + L(lBeforeAxStep); + uni_vmovups(vAux1, vmmSrcBeforeAxisSumB); L(lBeforeAxStepEnd); } } else { @@ -648,10 +684,10 @@ void jitUniGatherKernel::calcSrcShiftShortBlock(Vmm* vAuxPool, bool shiftFi add(rSpecIdxAndAfterAxIterB, idxElPerVec * jcp.dataTypeSize); cmp(rSpecIdxAndAfterAxIterB, rSpecIdxAndAfterAxSizeB); jl(lBeforeAxStepEnd1, T_NEAR); - sub(rSpecIdxAndAfterAxIterB, rSpecIdxAndAfterAxSizeB); + sub(rSpecIdxAndAfterAxIterB, rSpecIdxAndAfterAxSizeB); cmp(rSpecIdxAndAfterAxIterB, 0); jne(lBeforeAxStepEnd1, T_NEAR); - uni_vpaddd(vmmSrcBeforeAxisSumB, vmmSrcBeforeAxisSumB, vmmAxisAndAfterAxisSizeB); + uni_vpaddd(vmmSrcBeforeAxisSumB, vmmSrcBeforeAxisSumB, vmmAxisAndAfterAxisSizeB); L(lBeforeAxStepEnd1); } } @@ -689,15 +725,15 @@ void jitUniGatherKernel::process(bool isShortIdx, bool blocked) { Xbyak::Label lTailProc, lEndProc; cmp(regWorkAmount, dataElPerVec); jl(lTailProc, T_NEAR); - if (jcp.dataTypeSize == 4) - process32b(isShortIdx, blocked); - else if (jcp.dataTypeSize == 2) - process16b(isShortIdx, blocked); - else if (jcp.dataTypeSize == 1) - process8b(isShortIdx, blocked); + if (jcp.dataTypeSize == 4) + process32b(isShortIdx, blocked); + else if (jcp.dataTypeSize == 2) + process16b(isShortIdx, blocked); + else if (jcp.dataTypeSize == 1) + process8b(isShortIdx, blocked); jmp(lEndProc, T_NEAR); L(lTailProc); - 
tail(isShortIdx, false, blocked); + tail(isShortIdx, false, blocked); L(lEndProc); } @@ -735,11 +771,11 @@ void jitUniGatherKernel::process16b(bool isShortIdx, bool blocked) { if (isa == x64::avx512_core) { vPermMask = vmmAuxContainer[7]; vShufMask = vmmAuxContainer[8]; - vBuff0 = vmmAuxContainer[9]; + vBuff0 = vmmAuxContainer[9]; } else { vPermMask = vmmAuxContainer[1]; vShufMask = vmmAuxContainer[4]; - vBuff0 = vmmAuxContainer[5]; + vBuff0 = vmmAuxContainer[5]; } mov(regAux1, reinterpret_cast(shufMask16bitUni)); @@ -799,13 +835,13 @@ void jitUniGatherKernel::process8b(bool isShortIdx, bool blocked) { if (isa == x64::avx512_core) { vPermMask = vmmAuxContainer[7]; vShufMask = vmmAuxContainer[8]; - vBuff0 = vmmAuxContainer[9]; - vBuff1 = vmmAuxContainer[10]; + vBuff0 = vmmAuxContainer[9]; + vBuff1 = vmmAuxContainer[10]; } else { vPermMask = vmmAuxContainer[1]; vShufMask = vmmAuxContainer[4]; - vBuff0 = vmmAuxContainer[5]; - vBuff1 = vmmAuxContainer[6]; + vBuff0 = vmmAuxContainer[5]; + vBuff1 = vmmAuxContainer[6]; } mov(regAux1, reinterpret_cast(shufMask8bitUni)); uni_vmovups(vShufMask, ptr[regAux1]); @@ -951,24 +987,30 @@ void jitUniGatherKernel::tail(bool isShortIdx, bool shiftFirst, bool blocke } template <> -void jitUniGatherKernel::fillRestWorkMask(Vmask& kDstMask, Vmm& vmmAux, const Xbyak::Reg64& rWorkRest, - const Xbyak::Reg64& rAux0, const Xbyak::Reg64& rAux1) { +void jitUniGatherKernel::fillRestWorkMask(Vmask& kDstMask, + Vmm& vmmAux, + const Xbyak::Reg64& rWorkRest, + const Xbyak::Reg64& rAux0, + const Xbyak::Reg64& rAux1) { Xbyak::Label lKmov; Xbyak::Reg32 rOnes(rAux1.getIdx()); mov(rOnes, 0x0000FFFF); cmp(rWorkRest, idxElPerVec); jge(lKmov); - Xbyak::Reg8 rShift(Xbyak::Operand::CL); - mov(rShift, idxElPerVec); - sub(rShift, rWorkRest); - shr(rOnes, rShift); + Xbyak::Reg8 rShift(Xbyak::Operand::CL); + mov(rShift, idxElPerVec); + sub(rShift, rWorkRest); + shr(rOnes, rShift); L(lKmov); kmovw(kDstMask, rOnes); } template <> -void 
jitUniGatherKernel::fillRestWorkMask(Vmask& kDstMask, Vmm& vAux, const Xbyak::Reg64& rWorkRest, - const Xbyak::Reg64& rAux0, const Xbyak::Reg64& rAux1) { +void jitUniGatherKernel::fillRestWorkMask(Vmask& kDstMask, + Vmm& vAux, + const Xbyak::Reg64& rWorkRest, + const Xbyak::Reg64& rAux0, + const Xbyak::Reg64& rAux1) { Xbyak::Label lEnd; mov(rAux0, rWorkRest); Xbyak::Reg32 rOnes(rAux1.getIdx()); @@ -990,7 +1032,10 @@ void jitUniGatherKernel::fillRestWorkMask(Vmask& kDstMask, Vmm& vAux, } template -void jitUniGatherKernel::storeVectorPart(const Xbyak::Reg64& rDst, const Xbyak::Reg64& rToStoreCounter, Vmm& vmmSrc, Vmm& vAux) { +void jitUniGatherKernel::storeVectorPart(const Xbyak::Reg64& rDst, + const Xbyak::Reg64& rToStoreCounter, + Vmm& vmmSrc, + Vmm& vAux) { Xbyak::Label lEnd; Xbyak::Xmm xAux(vAux.getIdx()); for (size_t j = 0; j < vlen / vlenXmm; j++) { @@ -1025,7 +1070,7 @@ void jitUniGatherKernel::fillVlenVector() { template <> void jitUniGatherKernel::fillVlenVector() { vpcmpeqd(vmmVecLenB, vmmVecLenB, vmmVecLenB); - vpsrld(vmmVecLenB, vmmVecLenB, 31); // Right shift to 1. + vpsrld(vmmVecLenB, vmmVecLenB, 31); // Right shift to 1. uni_vpslld(vmmVecLenB, vmmVecLenB, 5); // Left shift to 32. 
} @@ -1047,5 +1092,5 @@ bool jitUniGatherKernel::isSupportedConfiguration(uint64_t afterAxisSize) { template struct jitUniGatherKernel; template struct jitUniGatherKernel; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/gather_uni_kernel.hpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/gather_uni_kernel.hpp index 765efb17d091e2..de8cda30d06499 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/gather_uni_kernel.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/gather_uni_kernel.hpp @@ -19,12 +19,11 @@ // 1 | X | X | X | X | X | X | //-------------------------------------------------------------- - #pragma once -#include "jit_kernel_base.hpp" #include "cpu/x64/jit_generator.hpp" #include "dnnl_types.h" +#include "jit_kernel_base.hpp" namespace ov { namespace intel_cpu { @@ -71,8 +70,8 @@ struct gatherJitExecArgs { }; struct jitGatherKernelBase { - void (*ker_)(const gatherJitExecArgs *); - void operator()(const gatherJitExecArgs *args) { + void (*ker_)(const gatherJitExecArgs*); + void operator()(const gatherJitExecArgs* args) { assert(ker_); ker_(args); } @@ -120,8 +119,10 @@ struct jitUniGatherKernel : public jitGatherKernelBase, public dnnl::impl::cpu:: bool isSupportedConfiguration(uint64_t afterAxisSize) override; protected: - using Vmm = typename dnnl::impl::utils::conditional::type; - using Vmask = typename dnnl::impl::utils::conditional::type; + using Vmm = + typename dnnl::impl::utils::conditional::type; + using Vmask = + typename dnnl::impl::utils::conditional::type; static const uint32_t vlenXmm = dnnl::impl::cpu::x64::cpu_isa_traits::vlen; static const uint32_t indicesTypeSize = sizeof(uint32_t); static const uint8_t idxTypeShift = 2; @@ -155,7 +156,8 @@ struct jitUniGatherKernel : public jitGatherKernelBase, public dnnl::impl::cpu:: // Masks pool. Do not use k0 with gather instruction! 
Vmask masksContainer[8] = {Vmask(0), Vmask(1), Vmask(2), Vmask(3), Vmask(4), Vmask(5), Vmask(6), Vmask(7)}; // Auxiliary pool. - Vmm vmmAuxContainer[12] = {Vmm(0), Vmm(1), Vmm(2), Vmm(3), Vmm(4), Vmm(5), Vmm(6), /*AVX5*/ Vmm(16), Vmm(17), Vmm(18), Vmm(19), Vmm(20)}; + Vmm vmmAuxContainer[12] = + {Vmm(0), Vmm(1), Vmm(2), Vmm(3), Vmm(4), Vmm(5), Vmm(6), /*AVX5*/ Vmm(16), Vmm(17), Vmm(18), Vmm(19), Vmm(20)}; // Common. Vmm vmmZeros = Vmm(7); Vmm vmmSrcBeforeAxisSumB = Vmm(8); @@ -165,13 +167,13 @@ struct jitUniGatherKernel : public jitGatherKernelBase, public dnnl::impl::cpu:: Vmm vmmAxisAndAfterAxisSizeB = Vmm(12); // Only short. - Vmm vmmSrcAfterBatchSizeB = Vmm(13); - Vmm vmmPermIdxMask = Vmm(14); + Vmm vmmSrcAfterBatchSizeB = Vmm(13); + Vmm vmmPermIdxMask = Vmm(14); Vmm& vmmBeforeAxDiffB = vmmAxisAndAfterAxisSizeB; // Blocked short. Vmm& vmmSpecIdxDiff = vmmAuxContainer[4]; Vmm& vmmAfterAxisSize = vmmAuxContainer[5]; - Vmm vmmAfterAxisIdxB = Vmm(15); + Vmm vmmAfterAxisIdxB = Vmm(15); Vmm& vmmAfterAxisPermMask = vmmPermIdxMask; Vmm& vmmBeforeAxPermMask = vmmAuxContainer[6]; // Only long. 
@@ -179,13 +181,13 @@ struct jitUniGatherKernel : public jitGatherKernelBase, public dnnl::impl::cpu:: Vmm vmmIdxBatchSumB = Vmm(14); // XMM - Xbyak::Xmm xmmAuxContainer[6] = {Xbyak::Xmm(0), Xbyak::Xmm(1), Xbyak::Xmm(2), Xbyak::Xmm(3), Xbyak::Xmm(4), Xbyak::Xmm(16)}; + Xbyak::Xmm xmmAuxContainer[6] = + {Xbyak::Xmm(0), Xbyak::Xmm(1), Xbyak::Xmm(2), Xbyak::Xmm(3), Xbyak::Xmm(4), Xbyak::Xmm(16)}; Xbyak::Xmm xmmZeros = Xbyak::Xmm(vmmZeros.getIdx()); Xbyak::Xmm xmmSrcBeforeAxisSum = Xbyak::Xmm(vmmSrcBeforeAxisSumB.getIdx()); Xbyak::Xmm xmmSpecIdxSizeB = Xbyak::Xmm(vmmSpecIdxSizeB.getIdx()); Xbyak::Xmm xmmSpecIdxB = Xbyak::Xmm(vmmSpecIdxB.getIdx()); - void calcSrcShiftLong(Vmm* vAuxPool, bool shiftFirst = true); void calcSrcShiftLongBlock(Vmm* vAuxPool, bool shiftFirst = true); void calcSrcShiftShort(Vmm* vAuxPool, bool shiftFirst = true); @@ -199,7 +201,11 @@ struct jitUniGatherKernel : public jitGatherKernelBase, public dnnl::impl::cpu:: // Aux functions. void normalizeRawIndices(Vmm& rawIndices, Vmask& dstMask, Vmask& aux); void normWithUpperBound(Vmm& vTarget, Vmm& vMax, Vmask& kAuxMask); - void fillRestWorkMask(Vmask& kMask, Vmm& vAux, const Xbyak::Reg64& rWorkRest, const Xbyak::Reg64& rAux0, const Xbyak::Reg64& rAux1); + void fillRestWorkMask(Vmask& kMask, + Vmm& vAux, + const Xbyak::Reg64& rWorkRest, + const Xbyak::Reg64& rAux0, + const Xbyak::Reg64& rAux1); void storeVectorPart(const Xbyak::Reg64& rDst, const Xbyak::Reg64& rToStoreCounter, Vmm& vmmSrc, Vmm& vAux); void uniVpGatherDd(Vmm& vDst, const Xbyak::Address& srcAddr, Vmask& vMask); void fillVlenVector(); @@ -208,5 +214,5 @@ struct jitUniGatherKernel : public jitGatherKernelBase, public dnnl::impl::cpu:: const unsigned* permMask16bitUni; }; -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/grid_sample.cpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/grid_sample.cpp index d91688689b86c0..908de00cbb0534 100644 
--- a/src/plugins/intel_cpu/src/nodes/kernels/x64/grid_sample.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/grid_sample.cpp @@ -13,8 +13,8 @@ namespace kernel { #define GET_OFF(field) offsetof(GridSamplesKernelExecArgs, field) template -GridSampleKernel::GridSampleKernel(const GridSampleKernelConfParams& jcp) : - GridSampleKernelBase(jit_name(), jcp, isa) { +GridSampleKernel::GridSampleKernel(const GridSampleKernelConfParams& jcp) + : GridSampleKernelBase(jit_name(), jcp, isa) { vlen = x64::cpu_isa_traits::vlen; dataTypeSize = jcp.inDataPrc.size(); gridTypeSize = jcp.gridPrc.size(); @@ -39,15 +39,15 @@ void GridSampleKernel::generate() { this->preamble(); registersPool = RegistersPool::create(isa, {rax, rcx, rsp, rdi, k0}); - regSrc = getReg64(); + regSrc = getReg64(); regGrid = getReg64(); - regDst = getReg64(); + regDst = getReg64(); regSrcChannelStepB = getReg64(); regDstChannelStepB = getReg64(); - mov(regSrc, ptr[regParams + GET_OFF(src)]); + mov(regSrc, ptr[regParams + GET_OFF(src)]); mov(regGrid, ptr[regParams + GET_OFF(grid)]); - mov(regDst, ptr[regParams + GET_OFF(dst)]); + mov(regDst, ptr[regParams + GET_OFF(dst)]); mov(regSrcChannelStepB, ptr[regParams + GET_OFF(srcChannelStepB)]); mov(regDstChannelStepB, ptr[regParams + GET_OFF(dstChannelStepB)]); @@ -82,7 +82,7 @@ void GridSampleKernel::initVectors() { if (one_of(jcp.interpolationMode, GridSampleInterpolationMode::BICUBIC, GridSampleInterpolationMode::BILINEAR)) { vOnesF = getVmm(); - mov(r32Aux, 0x3f800000); // 1.f + mov(r32Aux, 0x3f800000); // 1.f vpbroadcastd(vOnesF, r32Aux); } @@ -96,11 +96,11 @@ void GridSampleKernel::initVectors() { uni_vpbroadcastd(vHDenormCoefF, ptr[rAux]); } else { vHalfF = getVmm(); - mov(r32Aux, 0x3f000000); // 0.5f + mov(r32Aux, 0x3f000000); // 0.5f vpbroadcastd(vHalfF, r32Aux); } - static const unsigned gridPermMask[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; + static const unsigned gridPermMask[16] = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 
11, 13, 15}; mov(rAux, reinterpret_cast(gridPermMask)); vGridPermMask = getVmm(); uni_vmovups(vGridPermMask, ptr[rAux]); @@ -141,24 +141,24 @@ void GridSampleKernel::initVectors() { if (jcp.interpolationMode == GridSampleInterpolationMode::BICUBIC) { vConst_0_75 = getVmm(); - mov(r32Aux, 0xbf400000); // -0.75f + mov(r32Aux, 0xbf400000); // -0.75f vpbroadcastd(vConst_0_75, r32Aux); vConst_1_25 = getVmm(); - mov(r32Aux, 0x3fa00000); // 1.25f + mov(r32Aux, 0x3fa00000); // 1.25f vpbroadcastd(vConst_1_25, r32Aux); vConst_1_50 = getVmm(); - mov(r32Aux, 0x3fc00000); // 1.5f + mov(r32Aux, 0x3fc00000); // 1.5f vpbroadcastd(vConst_1_50, r32Aux); vConst_2_00 = getVmm(); - mov(r32Aux, 0x40000000); // 2.0f + mov(r32Aux, 0x40000000); // 2.0f vpbroadcastd(vConst_2_00, r32Aux); vConst_2_25 = getVmm(); - mov(r32Aux, 0x40100000); // 2.25f + mov(r32Aux, 0x40100000); // 2.25f vpbroadcastd(vConst_2_25, r32Aux); } } -template // Works for AVX2, AVX, SSE41 +template // Works for AVX2, AVX, SSE41 void GridSampleKernel::initVectors() { auto rAux = getReg64(); @@ -167,9 +167,10 @@ void GridSampleKernel::initVectors() { uni_vmovups(vSrcWidthF, ptr[rAux]); if (one_of(jcp.interpolationMode, GridSampleInterpolationMode::BILINEAR, GridSampleInterpolationMode::NEAREST) || - (jcp.interpolationMode == GridSampleInterpolationMode::BICUBIC && (jcp.paddingMode == GridSamplePaddingMode::REFLECTION || - (jcp.paddingMode == GridSamplePaddingMode::BORDER && !jcp.alignCorners) || - jcp.paddingMode == GridSamplePaddingMode::ZEROS)) ) { + (jcp.interpolationMode == GridSampleInterpolationMode::BICUBIC && + (jcp.paddingMode == GridSamplePaddingMode::REFLECTION || + (jcp.paddingMode == GridSamplePaddingMode::BORDER && !jcp.alignCorners) || + jcp.paddingMode == GridSamplePaddingMode::ZEROS))) { vSrcHeightF = getVmm(); mov(rAux, ptr[regParams + GET_OFF(srcHeightF)]); uni_vmovups(vSrcHeightF, ptr[rAux]); @@ -184,7 +185,8 @@ void GridSampleKernel::initVectors() { if (jcp.interpolationMode != 
GridSampleInterpolationMode::BICUBIC) { if (one_of(jcp.paddingMode, GridSamplePaddingMode::BORDER, GridSamplePaddingMode::ZEROS) && - ((isa == x64::avx2 && jcp.interpolationMode == GridSampleInterpolationMode::NEAREST) || one_of(isa, x64::avx, x64::sse41))) { + ((isa == x64::avx2 && jcp.interpolationMode == GridSampleInterpolationMode::NEAREST) || + one_of(isa, x64::avx, x64::sse41))) { vZeros = getVmm(); uni_vpxor(vZeros, vZeros, vZeros); } @@ -193,20 +195,21 @@ void GridSampleKernel::initVectors() { mov(rAux, ptr[regParams + GET_OFF(wDenormCoefF)]); vWDenormCoefF = getVmm(); uni_vmovups(vWDenormCoefF, ptr[rAux]); - if (!(jcp.interpolationMode == GridSampleInterpolationMode::BILINEAR && jcp.paddingMode == GridSamplePaddingMode::ZEROS)) { + if (!(jcp.interpolationMode == GridSampleInterpolationMode::BILINEAR && + jcp.paddingMode == GridSamplePaddingMode::ZEROS)) { mov(rAux, ptr[regParams + GET_OFF(hDenormCoefF)]); vHDenormCoefF = getVmm(); uni_vmovups(vHDenormCoefF, ptr[rAux]); } } else { - static const float halfArr[8] = { 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f }; + static const float halfArr[8] = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f}; mov(rAux, reinterpret_cast(halfArr)); vHalfF = getVmm(); uni_vmovups(vHalfF, ptr[rAux]); } if (isa == x64::avx2 && jcp.interpolationMode == GridSampleInterpolationMode::NEAREST) { - static const unsigned gridPermMask[8] = { 0, 2, 4, 6, 1, 3, 5, 7 }; + static const unsigned gridPermMask[8] = {0, 2, 4, 6, 1, 3, 5, 7}; mov(rAux, reinterpret_cast(gridPermMask)); vGridPermMask = getVmm(); uni_vmovups(vGridPermMask, ptr[rAux]); @@ -214,15 +217,16 @@ void GridSampleKernel::initVectors() { } if (jcp.interpolationMode == GridSampleInterpolationMode::BICUBIC || - (jcp.interpolationMode == GridSampleInterpolationMode::BILINEAR && jcp.paddingMode != GridSamplePaddingMode::ZEROS)) { - static const float onesArr[8] = { 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f }; + (jcp.interpolationMode == GridSampleInterpolationMode::BILINEAR && 
+ jcp.paddingMode != GridSamplePaddingMode::ZEROS)) { + static const float onesArr[8] = {1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f}; mov(rAux, reinterpret_cast(onesArr)); vOnesF = getVmm(); uni_vmovups(vOnesF, ptr[rAux]); } } -template // Works for AVX512, AVX2, AVX, SSE41 +template // Works for AVX512, AVX2, AVX, SSE41 void GridSampleKernel::process() { regWorkAmount = getReg64(); @@ -244,12 +248,12 @@ void GridSampleKernel::process() { spatialLoop(); if (jcp.dynamicShapes) { - add(regSrc, ptr[regParams + GET_OFF(srcBatchStepB)]); + add(regSrc, ptr[regParams + GET_OFF(srcBatchStepB)]); } else { add(regSrc, jcp.srcBatchStepB); } add(regGrid, ptr[regParams + GET_OFF(gridBatchStepB)]); - add(regDst, ptr[regParams + GET_OFF(dstBatchStepB)]); + add(regDst, ptr[regParams + GET_OFF(dstBatchStepB)]); if (jcp.dynamicBatch) { dec(regBatch); @@ -259,7 +263,7 @@ void GridSampleKernel::process() { } } -template // Works for AVX512, AVX2, AVX, SSE41 +template // Works for AVX512, AVX2, AVX, SSE41 void GridSampleKernel::spatialLoop() { auto vHCoord = getVmm(); auto vWCoord = getVmm(); @@ -286,7 +290,7 @@ void GridSampleKernel::spatialLoop() { tail(); } -template // Works for AVX512, AVX2, AVX, SSE41 +template // Works for AVX512, AVX2, AVX, SSE41 void GridSampleKernel::interpolation(const Vmm& vWCoord, const Vmm& vHCoord, bool tail) { if (jcp.interpolationMode == GridSampleInterpolationMode::BILINEAR) { bilinearInterpolation(vWCoord, vHCoord, tail); @@ -297,7 +301,7 @@ void GridSampleKernel::interpolation(const Vmm& vWCoord, const Vmm& vHCoord } } -template // Works for AVX512, AVX2, AVX, SSE41 +template // Works for AVX512, AVX2, AVX, SSE41 void GridSampleKernel::tail() { Xbyak::Label lEnd; cmp(regWorkAmount, 0); @@ -311,7 +315,7 @@ void GridSampleKernel::tail() { interpolation(vWCoord, vHCoord, true); if (dataTypeSize > 1) - sal(regWorkAmount, dataTypeShift); // Multiply by source data type size. + sal(regWorkAmount, dataTypeShift); // Multiply by source data type size. 
add(regDst, regWorkAmount); L(lEnd); @@ -319,15 +323,15 @@ void GridSampleKernel::tail() { template <> void GridSampleKernel::getCoordinates(const Vmm& vHCoord, const Vmm& vWCoord) { - vpermd(vWCoord, vGridPermMask, ptr[regGrid]); // Permute to XXXX.XXXX.YYYY.YYYY - vshuff64x2(vHCoord, vWCoord, vHCoord, 0B11101110); // Extract Y component + vpermd(vWCoord, vGridPermMask, ptr[regGrid]); // Permute to XXXX.XXXX.YYYY.YYYY + vshuff64x2(vHCoord, vWCoord, vHCoord, 0B11101110); // Extract Y component add(regGrid, vlen); auto vAux = getVmm(); - vpermd(vAux, vGridPermMask, ptr[regGrid]); // Permute to XXXX.XXXX.YYYY.YYYY - vshuff64x2(vWCoord, vWCoord, vAux, 0B01000100); // Extract X component - vshuff64x2(vHCoord, vHCoord, vAux, 0B11100100); // Extract Y component + vpermd(vAux, vGridPermMask, ptr[regGrid]); // Permute to XXXX.XXXX.YYYY.YYYY + vshuff64x2(vWCoord, vWCoord, vAux, 0B01000100); // Extract X component + vshuff64x2(vHCoord, vHCoord, vAux, 0B11100100); // Extract Y component add(regGrid, vlen); } @@ -349,19 +353,19 @@ void GridSampleKernel::getCoordinates(const Vmm& vHCoord, const Vmm& uni_vmovups(vPermMask, ptr[rAux]); } - vpermd(vWCoord, vPermMask, ptr[regGrid]); // Permute to XXXX.YYYY - vperm2f128(vHCoord, vHCoord, vWCoord, 0B00000011); // Extract Y component + vpermd(vWCoord, vPermMask, ptr[regGrid]); // Permute to XXXX.YYYY + vperm2f128(vHCoord, vHCoord, vWCoord, 0B00000011); // Extract Y component add(regGrid, vlen); - vpermd(vAux, vPermMask, ptr[regGrid]); // Permute to XXXX.YYYY - vperm2f128(vWCoord, vWCoord, vAux, 0B00100000); // Extract X component - vperm2f128(vHCoord, vHCoord, vAux, 0B00110000); // Extract Y component + vpermd(vAux, vPermMask, ptr[regGrid]); // Permute to XXXX.YYYY + vperm2f128(vWCoord, vWCoord, vAux, 0B00100000); // Extract X component + vperm2f128(vHCoord, vHCoord, vAux, 0B00110000); // Extract Y component add(regGrid, vlen); } -template // Works for AVX, SSE41 +template // Works for AVX, SSE41 void 
GridSampleKernel::getCoordinates(const Vmm& vHCoord, const Vmm& vWCoord) { auto vAux = getVmm(); Xbyak::Xmm xmmWCoord(vWCoord.getIdx()); @@ -417,12 +421,12 @@ void GridSampleKernel::getTailCoordinates(const Vmm& vHCoord, auto rAux = getReg64(); mov(rAux, regWorkAmount); - sal(rAux, 0x1); // Multiply by gridShape[3]. + sal(rAux, 0x1); // Multiply by gridShape[3]. cmp(regWorkAmount, dataElPerVec / 2); jl(lRest, T_NEAR); { vpermd(vWCoord, vGridPermMask, ptr[regGrid]); - vshuff64x2(vHCoord, vWCoord, vHCoord, 0B11101110); // Extract Y component + vshuff64x2(vHCoord, vWCoord, vHCoord, 0B11101110); // Extract Y component add(regGrid, vlen); sub(rAux, dataElPerVec); @@ -433,8 +437,8 @@ void GridSampleKernel::getTailCoordinates(const Vmm& vHCoord, uni_vmovups((Vmm)vAux | kTailMask, ptr[regGrid]); vpermd(vAux, vGridPermMask, vAux); Xbyak::Ymm ymmAux(vAux.getIdx()); - vshuff64x2(vWCoord, vWCoord, vAux, 0B01000100); // Extract X component - vshuff64x2(vHCoord, vHCoord, vAux, 0B11100100); // Extract Y component + vshuff64x2(vWCoord, vWCoord, vAux, 0B01000100); // Extract X component + vshuff64x2(vHCoord, vHCoord, vAux, 0B11100100); // Extract Y component jmp(lGridShift, T_NEAR); } @@ -443,12 +447,12 @@ void GridSampleKernel::getTailCoordinates(const Vmm& vHCoord, fillRestWorkMask(kTailMask, rAux); uni_vmovups(vWCoord | kTailMask, ptr[regGrid]); vpermd(vWCoord, vGridPermMask, vWCoord); - vshuff64x2(vHCoord, vWCoord, vHCoord, 0B11101110); // Extract Y component + vshuff64x2(vHCoord, vWCoord, vHCoord, 0B11101110); // Extract Y component } L(lGridShift); if (dataTypeSize > 1) - sal(rAux, dataTypeShift); // Multiply by source data type size. + sal(rAux, dataTypeShift); // Multiply by source data type size. 
add(regGrid, rAux); L(lEnd); @@ -475,36 +479,36 @@ void GridSampleKernel::getTailCoordinates(const Vmm& vHCoord, const V } mov(rAux, regWorkAmount); - sal(rAux, 0x1); // multiply by gridShape[3] == 2 + sal(rAux, 0x1); // multiply by gridShape[3] == 2 cmp(regWorkAmount, dataElPerVec / 2); jl(lRest, T_NEAR); { - vpermd(vWCoord, vPermMask, ptr[regGrid]); // Permute to XXXX.YYYY - vperm2f128(vHCoord, vHCoord, vWCoord, 0B00000011); // Extract Y component + vpermd(vWCoord, vPermMask, ptr[regGrid]); // Permute to XXXX.YYYY + vperm2f128(vHCoord, vHCoord, vWCoord, 0B00000011); // Extract Y component add(regGrid, vlen); sub(rAux, dataElPerVec); cmp(rAux, 0); jle(lEnd, T_NEAR); - auto vAux = getVmm(); + auto vAux = getVmm(); load(vAux, ptr[regGrid], rAux, dataTypeSize); vpermd(vAux, vPermMask, vAux); - vperm2f128(vWCoord, vWCoord, vAux, 0B00100000); // Extract X component - vperm2f128(vHCoord, vHCoord, vAux, 0B00110000); // Extract Y component + vperm2f128(vWCoord, vWCoord, vAux, 0B00100000); // Extract X component + vperm2f128(vHCoord, vHCoord, vAux, 0B00110000); // Extract Y component jmp(lGridShift, T_NEAR); } L(lRest); { load(vWCoord, ptr[regGrid], rAux, dataTypeSize); - vpermd(vWCoord, vPermMask, vWCoord); // Permute to XXXX.YYYY - vperm2f128(vHCoord, vHCoord, vWCoord, 0B00000011); // Extract Y component + vpermd(vWCoord, vPermMask, vWCoord); // Permute to XXXX.YYYY + vperm2f128(vHCoord, vHCoord, vWCoord, 0B00000011); // Extract Y component } L(lGridShift); if (dataTypeSize > 1) - sal(rAux, dataTypeShift); // Multiply by source data type size. + sal(rAux, dataTypeShift); // Multiply by source data type size. 
add(regGrid, rAux); L(lEnd); @@ -519,7 +523,7 @@ void GridSampleKernel::getTailCoordinates(const Vmm& vHCoord, const Vm auto rGridRest = getReg64(); mov(rGridRest, regWorkAmount); - sal(rGridRest, 0x1); // multiply by gridShape[3] == 2 + sal(rGridRest, 0x1); // multiply by gridShape[3] == 2 for (size_t i = 0; i < dataElPerVec; i++) { cmp(rGridRest, 0); @@ -566,7 +570,7 @@ void GridSampleKernel::getTailCoordinates(const Vmm& vHCoord, const auto rAux = getReg64(); mov(rAux, regWorkAmount); - sal(rAux, 0x1); // Multiply by gridShape[3] == 2 + sal(rAux, 0x1); // Multiply by gridShape[3] == 2 cmp(regWorkAmount, dataElPerVec / 2); jl(lRest, T_NEAR); { @@ -584,31 +588,31 @@ void GridSampleKernel::getTailCoordinates(const Vmm& vHCoord, const auto vAux = getVmm(); load(vAux, ptr[regGrid], rAux, dataTypeSize); pshufd(vAux, vAux, 0B11011000); - shufpd(vWCoord, vAux, 0x0); // Extract X component - shufpd(vHCoord, vAux, 0B00000011); // Extract Y component + shufpd(vWCoord, vAux, 0x0); // Extract X component + shufpd(vHCoord, vAux, 0B00000011); // Extract Y component jmp(lGridShift, T_NEAR); L(lHShuf); - shufpd(vHCoord, vHCoord, 0B00000001); // Extract Y component + shufpd(vHCoord, vHCoord, 0B00000001); // Extract Y component jmp(lEnd, T_NEAR); } L(lRest); { load(vWCoord, ptr[regGrid], rAux, dataTypeSize); - pshufd(vWCoord, vWCoord, 0B11011000); // Extract X component - shufpd(vHCoord, vWCoord, 0B00000010); // Extract Y component + pshufd(vWCoord, vWCoord, 0B11011000); // Extract X component + shufpd(vHCoord, vWCoord, 0B00000010); // Extract Y component shufpd(vHCoord, vHCoord, 0B00000001); } L(lGridShift); if (dataTypeSize > 1) - sal(rAux, dataTypeShift); // Multiply by source data type size. + sal(rAux, dataTypeShift); // Multiply by source data type size. 
add(regGrid, rAux); L(lEnd); } -template // Works for AVX512, AVX2, AVX, SSE41 +template // Works for AVX512, AVX2, AVX, SSE41 void GridSampleKernel::denormalizeRawCoordinates(const Vmm& vWCoord, const Vmm& vHCoord) { if (jcp.alignCorners) { if (vWDenormCoefF.isInitialized()) { @@ -640,7 +644,7 @@ void GridSampleKernel::denormalizeRawCoordinates(const Vmm& vWCoord, const halfHolder = getVmm(); vHalfTmp = halfHolder; static const float halfValues[x64::cpu_isa_traits::vlen / sizeof(float)] = - { 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f }; + {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f}; mov(rAux, reinterpret_cast(halfValues)); uni_vmovups(vHalfTmp, ptr[rAux]); } @@ -671,14 +675,14 @@ void GridSampleKernel::denormalizeRawCoordinates(const Vmm& vWCoord, const template <> void GridSampleKernel::zerosPaddingW(const Vmask& kDst, const Vmm& vCoord) { - vcmpps(kDst, vCoord, vSrcWidthF, CMP_LT_PS); // vCoord < vUpperBound - vcmpps(kDst | kDst, vZeros, vCoord, CMP_LE_PS); // vCoord >= vZeros + vcmpps(kDst, vCoord, vSrcWidthF, CMP_LT_PS); // vCoord < vUpperBound + vcmpps(kDst | kDst, vZeros, vCoord, CMP_LE_PS); // vCoord >= vZeros } template <> void GridSampleKernel::zerosPaddingH(const Vmask& kDst, const Vmm& vCoord, const Vmask& kMaskW) { - vcmpps(kDst | kMaskW, vCoord, vSrcHeightF, CMP_LT_PS); // vCoord < vUpperBound - vcmpps(kDst | kDst, vZeros, vCoord, CMP_LE_PS); // vCoord >= vZeros + vcmpps(kDst | kMaskW, vCoord, vSrcHeightF, CMP_LT_PS); // vCoord < vUpperBound + vcmpps(kDst | kDst, vZeros, vCoord, CMP_LE_PS); // vCoord >= vZeros } template <> @@ -692,7 +696,7 @@ void GridSampleKernel::zerosPaddingW(const Vmask& kDst, const Vmm& v auto vAux = getVmm(); if (vSrcWidthF.isInitialized()) { - uni_vcmpps(vAux, vWCoord, vSrcWidthF, CMP_LT_PS); // vWCoord < vSrcWidthF + uni_vcmpps(vAux, vWCoord, vSrcWidthF, CMP_LT_PS); // vWCoord < vSrcWidthF } else { auto rAux = getReg64(); 
mov(rAux, ptr[regParams + GET_OFF(srcWidthF)]); @@ -700,8 +704,8 @@ void GridSampleKernel::zerosPaddingW(const Vmask& kDst, const Vmm& v } uni_vpxor(kDst, kDst, kDst); - uni_vcmpps(kDst, kDst, vWCoord, CMP_LE_PS); // vWCoord >= vZeros - uni_vpand(kDst, kDst, vAux); // vZeros <= vWCoord < vSrcWidthF + uni_vcmpps(kDst, kDst, vWCoord, CMP_LE_PS); // vWCoord >= vZeros + uni_vpand(kDst, kDst, vAux); // vZeros <= vWCoord < vSrcWidthF } template <> @@ -709,18 +713,18 @@ void GridSampleKernel::zerosPaddingH(const Vmask& kDst, const Vmm& v auto vAux = getVmm(); if (vSrcHeightF.isInitialized()) { - uni_vcmpps(vAux, vHCoord, vSrcHeightF, CMP_LT_PS); // vHCoord < vSrcHeightF + uni_vcmpps(vAux, vHCoord, vSrcHeightF, CMP_LT_PS); // vHCoord < vSrcHeightF } else { auto rAux = getReg64(); mov(rAux, ptr[regParams + GET_OFF(srcHeightF)]); - uni_vcmpps(vAux, vHCoord, ptr[rAux], CMP_LT_PS); // vHCoord < vSrcHeightF + uni_vcmpps(vAux, vHCoord, ptr[rAux], CMP_LT_PS); // vHCoord < vSrcHeightF } uni_vmovups(kDst, kMaskW); - uni_vpand(kDst, kDst, vAux); // vHCoord < vSrcHeightF && vZeros <= vWCoord < vSrcWidthF + uni_vpand(kDst, kDst, vAux); // vHCoord < vSrcHeightF && vZeros <= vWCoord < vSrcWidthF uni_vpxor(vAux, vAux, vAux); - uni_vcmpps(vAux, vAux, vHCoord, CMP_LE_PS); // vHCoord >= vZeros - uni_vpand(kDst, kDst, vAux); // vZeros <= vHCoord < vSrcHeightF && vZeros <= vWCoord < vSrcWidthF + uni_vcmpps(vAux, vAux, vHCoord, CMP_LE_PS); // vHCoord >= vZeros + uni_vpand(kDst, kDst, vAux); // vZeros <= vHCoord < vSrcHeightF && vZeros <= vWCoord < vSrcWidthF } template <> @@ -729,7 +733,7 @@ void GridSampleKernel::zerosPadding(const Vmask& kDst, const Vmm& vH zerosPaddingH(kDst, vHCoord, kDst); } -template // Works for AVX2, AVX +template // Works for AVX2, AVX void GridSampleKernel::zerosPaddingW(const Vmask& kDst, const Vmm& vCoord) { auto vAux = getVmm(); Vmm vZerosTmp; @@ -743,18 +747,18 @@ void GridSampleKernel::zerosPaddingW(const Vmask& kDst, const Vmm& vCoord) } if 
(vSrcWidthF.isInitialized()) { - uni_vcmpps(vAux, vCoord, vSrcWidthF, CMP_LT_PS); // vWCoord < vSrcWidthF + uni_vcmpps(vAux, vCoord, vSrcWidthF, CMP_LT_PS); // vWCoord < vSrcWidthF } else { auto rAux = getReg64(); mov(rAux, ptr[regParams + GET_OFF(srcWidthF)]); uni_vcmpps(vAux, vCoord, ptr[rAux], CMP_LT_PS); // vWCoord < vSrcWidthF } - uni_vcmpps(kDst, vZerosTmp, vCoord, CMP_LE_PS); // vWCoord >= vZeros - uni_vandps(kDst, kDst, vAux); // vZeros <= vWCoord < vSrcWidthF + uni_vcmpps(kDst, vZerosTmp, vCoord, CMP_LE_PS); // vWCoord >= vZeros + uni_vandps(kDst, kDst, vAux); // vZeros <= vWCoord < vSrcWidthF } -template // Works for AVX2, AVX +template // Works for AVX2, AVX void GridSampleKernel::zerosPaddingH(const Vmask& kDst, const Vmm& vCoord, const Vmask& kMaskW) { auto vAux = getVmm(); Vmm vZerosTmp; @@ -768,19 +772,19 @@ void GridSampleKernel::zerosPaddingH(const Vmask& kDst, const Vmm& vCoord, } if (vSrcHeightF.isInitialized()) { - uni_vcmpps(vAux, vCoord, vSrcHeightF, CMP_LT_PS); // vHCoord < vSrcHeightF + uni_vcmpps(vAux, vCoord, vSrcHeightF, CMP_LT_PS); // vHCoord < vSrcHeightF } else { auto rAux = getReg64(); mov(rAux, ptr[regParams + GET_OFF(srcHeightF)]); - uni_vcmpps(vAux, vCoord, ptr[rAux], CMP_LT_PS); // vHCoord < vSrcHeightF + uni_vcmpps(vAux, vCoord, ptr[rAux], CMP_LT_PS); // vHCoord < vSrcHeightF } uni_vandps(kDst, kMaskW, vAux); - uni_vcmpps(vAux, vZerosTmp, vCoord, CMP_LE_PS); // vHCoord >= vZeros + uni_vcmpps(vAux, vZerosTmp, vCoord, CMP_LE_PS); // vHCoord >= vZeros uni_vandps(kDst, kDst, vAux); } -template // Works for AVX2, AVX +template // Works for AVX2, AVX void GridSampleKernel::zerosPadding(const Vmask& kDst, const Vmm& vHCoord, const Vmm& vWCoord) { bool releaseZeroVec = false; if (!vZeros.isInitialized()) { @@ -799,11 +803,14 @@ void GridSampleKernel::zerosPadding(const Vmask& kDst, const Vmm& vHCoord, template <> void GridSampleKernel::borderPadding(const Vmm& vCoordDst, const Vmm& vCoordOrigin, const coord dim) { - vrangeps(vCoordDst, 
vCoordOrigin, dim == coord::w ? vSrcWidthSub1F : vSrcHeightSub1F, 0x0); // vWCoord >= vSrcWidthF - vrangeps(vCoordDst, vCoordDst, vZeros, 0x1); // vWCoord < vZeros + vrangeps(vCoordDst, + vCoordOrigin, + dim == coord::w ? vSrcWidthSub1F : vSrcHeightSub1F, + 0x0); // vWCoord >= vSrcWidthF + vrangeps(vCoordDst, vCoordDst, vZeros, 0x1); // vWCoord < vZeros } -template // Works for AVX2, AVX, SSE41 +template // Works for AVX2, AVX, SSE41 void GridSampleKernel::borderPadding(const Vmm& vCoordDst, const Vmm& vCoordOrigin, const coord dim) { auto rAux = getReg64(); auto vAux = getVmm(); @@ -836,7 +843,7 @@ void GridSampleKernel::borderPadding(const Vmm& vCoordDst, const Vmm& vCoor uni_vaddps(vCoordDst, vCoordDst, vAux); if (vZeros.isInitialized()) { - uni_vcmpps(vAux, vCoordDst, vZeros, 0x6); // vCoord >= vZeros + uni_vcmpps(vAux, vCoordDst, vZeros, 0x6); // vCoord >= vZeros } else { if (isa == x64::sse41) { if (!vAux1.isInitialized()) { @@ -844,27 +851,29 @@ void GridSampleKernel::borderPadding(const Vmm& vCoordDst, const Vmm& vCoor vSub1F = vAux1; } uni_vpxor(vSub1F, vSub1F, vSub1F); - uni_vcmpps(vAux, vCoordDst, vSub1F, 0x6); // vCoord >= vZeros + uni_vcmpps(vAux, vCoordDst, vSub1F, 0x6); // vCoord >= vZeros } else { uni_vpxor(vAux, vAux, vAux); - uni_vcmpps(vAux, vCoordDst, vAux, 0x6); // vCoord >= vZeros + uni_vcmpps(vAux, vCoordDst, vAux, 0x6); // vCoord >= vZeros } } uni_vandps(vCoordDst, vCoordDst, vAux); } template <> -void GridSampleKernel::reflectionPadding(const Vmm& vCoordDst, const Vmm& vCoordOrigin, const coord dim) { +void GridSampleKernel::reflectionPadding(const Vmm& vCoordDst, + const Vmm& vCoordOrigin, + const coord dim) { auto vAux = getVmm(); auto kAux = getMask(); const auto& vSrcDimMul2Sub1F = dim == coord::w ? 
vSrcWidthMul2Sub1F : vSrcHeightMul2Sub1F; if (jcp.alignCorners) { // abs(x) % D21 - uni_vandps(vCoordDst, vCoordOrigin, vAbsMask); // abs(x) + uni_vandps(vCoordDst, vCoordOrigin, vAbsMask); // abs(x) uni_vdivps(vAux, vCoordDst, vSrcDimMul2Sub1F); - uni_vroundps(vAux, vAux, 0x3); // Truncation - uni_vfnmadd231ps(vCoordDst, vAux, vSrcDimMul2Sub1F); // abs(x) % D21 + uni_vroundps(vAux, vAux, 0x3); // Truncation + uni_vfnmadd231ps(vCoordDst, vAux, vSrcDimMul2Sub1F); // abs(x) % D21 // Check that the result does not exceed the divisor. vcmpps(kAux, vSrcDimMul2Sub1F, vCoordDst, CMP_LE_PS); @@ -876,12 +885,12 @@ void GridSampleKernel::reflectionPadding(const Vmm& vCoordDst, if (vCoordDst.getIdx() != vCoordOrigin.getIdx()) uni_vmovups(vCoordDst, vCoordOrigin); uni_vdivps(vAux, vCoordDst, vSrcDimMul2F); - uni_vroundps(vAux, vAux, 0x3); // Truncation - uni_vfnmadd231ps(vCoordDst, vAux, vSrcDimMul2F); // x % D2 - uni_vaddps(vCoordDst, vCoordDst, vSrcDimMul2F); // x % D2 + D2 + uni_vroundps(vAux, vAux, 0x3); // Truncation + uni_vfnmadd231ps(vCoordDst, vAux, vSrcDimMul2F); // x % D2 + uni_vaddps(vCoordDst, vCoordDst, vSrcDimMul2F); // x % D2 + D2 uni_vdivps(vAux, vCoordDst, vSrcDimMul2F); - uni_vroundps(vAux, vAux, 0x3); // Truncation - uni_vfnmadd231ps(vCoordDst, vAux, vSrcDimMul2F); // (x % D2 + D2) % D2 + uni_vroundps(vAux, vAux, 0x3); // Truncation + uni_vfnmadd231ps(vCoordDst, vAux, vSrcDimMul2F); // (x % D2 + D2) % D2 // Check that the result does not exceed the divisor. vcmpps(kAux, vSrcDimMul2F, vCoordDst, CMP_LE_PS); @@ -890,13 +899,13 @@ void GridSampleKernel::reflectionPadding(const Vmm& vCoordDst, } uni_vsubps(vAux, vSrcDimMul2Sub1F, vCoordDst); - vcmpps(kAux, dim == coord::w ? vSrcWidthF : vSrcHeightF, vCoordDst, CMP_LE_PS); // vCoordDst >= vSrcDimF + vcmpps(kAux, dim == coord::w ? 
vSrcWidthF : vSrcHeightF, vCoordDst, CMP_LE_PS); // vCoordDst >= vSrcDimF uni_vmovups(vCoordDst | kAux, vAux); } -template // Works for AVX2, AVX, SSE41 +template // Works for AVX2, AVX, SSE41 void GridSampleKernel::reflectionPadding(const Vmm& vCoordDst, const Vmm& vCoordOrigin, const coord dim) { - auto rAux = getReg64(); + auto rAux = getReg64(); auto vAux0 = getVmm(); auto vAux1 = getVmm(); @@ -904,14 +913,15 @@ void GridSampleKernel::reflectionPadding(const Vmm& vCoordDst, const Vmm& v // D21 = (Dim - 1) * 2 if (jcp.alignCorners) { // x' = abs(x) % D21 - D21 - static const unsigned absMask[8] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; - if (isa ==x64::sse41) { - static const unsigned *absPtr = absMask + (reinterpret_cast(absMask) % 16) / sizeof(unsigned); + static const unsigned absMask[8] = + {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}; + if (isa == x64::sse41) { + static const unsigned* absPtr = absMask + (reinterpret_cast(absMask) % 16) / sizeof(unsigned); mov(rAux, reinterpret_cast(absPtr)); } else { mov(rAux, reinterpret_cast(absMask)); } - uni_vandps(vCoordDst, vCoordOrigin, ptr[rAux]); // abs(x) + uni_vandps(vCoordDst, vCoordOrigin, ptr[rAux]); // abs(x) Vmm vMul2Sub1; if (dim == coord::w) { @@ -932,8 +942,8 @@ void GridSampleKernel::reflectionPadding(const Vmm& vCoordDst, const Vmm& v } } uni_vdivps(vAux0, vCoordDst, vMul2Sub1); - uni_vroundps(vAux0, vAux0, 0x3); // Truncation - uni_vfnmadd231ps(vCoordDst, vAux0, vMul2Sub1); // abs(x) % D21 + uni_vroundps(vAux0, vAux0, 0x3); // Truncation + uni_vfnmadd231ps(vCoordDst, vAux0, vMul2Sub1); // abs(x) % D21 // Check that the result does not exceed the divisor. 
uni_vcmpps(vAux0, vCoordDst, vMul2Sub1, CMP_LT_PS); @@ -942,7 +952,7 @@ void GridSampleKernel::reflectionPadding(const Vmm& vCoordDst, const Vmm& v uni_vcmpps(vAux0, vAux0, vCoordDst, CMP_LE_PS); uni_vandps(vCoordDst, vCoordDst, vAux0); - uni_vsubps(vAux0, vCoordDst, vMul2Sub1); // abs(x) % D21 - D21 + uni_vsubps(vAux0, vCoordDst, vMul2Sub1); // abs(x) % D21 - D21 } else { // x' = (x % D2 + D2) % D2 - D21 if (vCoordDst.getIdx() != vCoordOrigin.getIdx()) @@ -966,12 +976,12 @@ void GridSampleKernel::reflectionPadding(const Vmm& vCoordDst, const Vmm& v } } uni_vdivps(vAux0, vCoordOrigin, vMul2); - uni_vroundps(vAux0, vAux0, 0x3); // Truncation - uni_vfnmadd231ps(vCoordDst, vAux0, vMul2); // x % D2 - uni_vaddps(vCoordDst, vCoordDst, vMul2); // x % D2 + D2 + uni_vroundps(vAux0, vAux0, 0x3); // Truncation + uni_vfnmadd231ps(vCoordDst, vAux0, vMul2); // x % D2 + uni_vaddps(vCoordDst, vCoordDst, vMul2); // x % D2 + D2 uni_vdivps(vAux0, vCoordDst, vMul2); - uni_vroundps(vAux0, vAux0, 0x3); // Truncation - uni_vfnmadd231ps(vCoordDst, vAux0, vMul2); // (x % D2 + D2) % D2 + uni_vroundps(vAux0, vAux0, 0x3); // Truncation + uni_vfnmadd231ps(vCoordDst, vAux0, vMul2); // (x % D2 + D2) % D2 // Check that the result does not exceed the divisor. 
uni_vcmpps(vAux0, vCoordDst, vMul2, CMP_LT_PS); @@ -1002,20 +1012,20 @@ void GridSampleKernel::reflectionPadding(const Vmm& vCoordDst, const Vmm& v uni_vcmpps(vAux1, vCoordDst, vSrcWidthF, CMP_LT_PS); // vCoordDst < vUpperBound } else { mov(rAux, ptr[regParams + GET_OFF(srcWidthF)]); - uni_vcmpps(vAux1, vCoordDst, ptr[rAux], CMP_LT_PS); // vCoordDst < vUpperBound + uni_vcmpps(vAux1, vCoordDst, ptr[rAux], CMP_LT_PS); // vCoordDst < vUpperBound } } else { if (vSrcHeightF.isInitialized()) { - uni_vcmpps(vAux1, vCoordDst, vSrcHeightF, CMP_LT_PS); // vCoordDst < vUpperBound + uni_vcmpps(vAux1, vCoordDst, vSrcHeightF, CMP_LT_PS); // vCoordDst < vUpperBound } else { mov(rAux, ptr[regParams + GET_OFF(srcHeightF)]); - uni_vcmpps(vAux1, vCoordDst, ptr[rAux], CMP_LT_PS); // vCoordDst < vUpperBound + uni_vcmpps(vAux1, vCoordDst, ptr[rAux], CMP_LT_PS); // vCoordDst < vUpperBound } } uni_vandps(vCoordDst, vCoordDst, vAux1); uni_vandnps(vAux1, vAux1, vAux0); - uni_vsubps(vCoordDst, vCoordDst, vAux1); // set -x' for vCoordDst >= Dim + uni_vsubps(vCoordDst, vCoordDst, vAux1); // set -x' for vCoordDst >= Dim } template <> @@ -1045,12 +1055,13 @@ void GridSampleKernel::bicubicCoefficients(const Vmm& vCoef, c template <> void GridSampleKernel::bicubicCoefficients(const Vmm& vCoef, const Vmm& vDDim, const uint8_t idx) { - static const size_t elPerVec = x64::cpu_isa_traits::vlen / sizeof(float);; - static const float const_0_75[elPerVec] = { -0.75f, -0.75f, -0.75f, -0.75f, -0.75f, -0.75f, -0.75f, -0.75f }; - static const float const_1_25[elPerVec] = { 1.25f, 1.25f, 1.25f, 1.25f, 1.25f, 1.25f, 1.25f, 1.25f }; - static const float const_1_50[elPerVec] = { 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f }; - static const float const_2_00[elPerVec] = { 2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f }; - static const float const_2_25[elPerVec] = { 2.25f, 2.25f, 2.25f, 2.25f, 2.25f, 2.25f, 2.25f, 2.25f }; + static const size_t elPerVec = x64::cpu_isa_traits::vlen / sizeof(float); + ; + static const 
float const_0_75[elPerVec] = {-0.75f, -0.75f, -0.75f, -0.75f, -0.75f, -0.75f, -0.75f, -0.75f}; + static const float const_1_25[elPerVec] = {1.25f, 1.25f, 1.25f, 1.25f, 1.25f, 1.25f, 1.25f, 1.25f}; + static const float const_1_50[elPerVec] = {1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f}; + static const float const_2_00[elPerVec] = {2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f}; + static const float const_2_25[elPerVec] = {2.25f, 2.25f, 2.25f, 2.25f, 2.25f, 2.25f, 2.25f, 2.25f}; auto rAux = getReg64(); @@ -1088,11 +1099,11 @@ void GridSampleKernel::bicubicCoefficients(const Vmm& vCoef, const Vm template <> void GridSampleKernel::bicubicCoefficients(const Vmm& vCoef, const Vmm& vDDim, const uint8_t idx) { static const size_t elPerVec = x64::cpu_isa_traits::vlen / sizeof(float); - static const float const_0_75[elPerVec] = { -0.75f, -0.75f, -0.75f, -0.75f, -0.75f, -0.75f, -0.75f, -0.75f }; - static const float const_1_25[elPerVec] = { 1.25f, 1.25f, 1.25f, 1.25f, 1.25f, 1.25f, 1.25f, 1.25f }; - static const float const_1_50[elPerVec] = { 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f }; - static const float const_2_00[elPerVec] = { 2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f }; - static const float const_2_25[elPerVec] = { 2.25f, 2.25f, 2.25f, 2.25f, 2.25f, 2.25f, 2.25f, 2.25f }; + static const float const_0_75[elPerVec] = {-0.75f, -0.75f, -0.75f, -0.75f, -0.75f, -0.75f, -0.75f, -0.75f}; + static const float const_1_25[elPerVec] = {1.25f, 1.25f, 1.25f, 1.25f, 1.25f, 1.25f, 1.25f, 1.25f}; + static const float const_1_50[elPerVec] = {1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f}; + static const float const_2_00[elPerVec] = {2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f}; + static const float const_2_25[elPerVec] = {2.25f, 2.25f, 2.25f, 2.25f, 2.25f, 2.25f, 2.25f, 2.25f}; auto rAux = getReg64(); auto vAux = getVmm(); @@ -1136,11 +1147,11 @@ template <> void GridSampleKernel::bicubicCoefficients(const Vmm& vCoef, const Vmm& vDDim, const uint8_t idx) { static const size_t elToAllocate = 2 
* x64::cpu_isa_traits::vlen / sizeof(float); // Allocation with a margin for address alignment. - static const float c_0_75[elToAllocate] = { -0.75f, -0.75f, -0.75f, -0.75f, -0.75f, -0.75f, -0.75f, -0.75f }; - static const float c_1_25[elToAllocate] = { 1.25f, 1.25f, 1.25f, 1.25f, 1.25f, 1.25f, 1.25f, 1.25f }; - static const float c_1_50[elToAllocate] = { 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f }; - static const float c_2_00[elToAllocate] = { 2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f }; - static const float c_2_25[elToAllocate] = { 2.25f, 2.25f, 2.25f, 2.25f, 2.25f, 2.25f, 2.25f, 2.25f }; + static const float c_0_75[elToAllocate] = {-0.75f, -0.75f, -0.75f, -0.75f, -0.75f, -0.75f, -0.75f, -0.75f}; + static const float c_1_25[elToAllocate] = {1.25f, 1.25f, 1.25f, 1.25f, 1.25f, 1.25f, 1.25f, 1.25f}; + static const float c_1_50[elToAllocate] = {1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f}; + static const float c_2_00[elToAllocate] = {2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f}; + static const float c_2_25[elToAllocate] = {2.25f, 2.25f, 2.25f, 2.25f, 2.25f, 2.25f, 2.25f, 2.25f}; // Address alignment for XMM. 
static const float* const_0_75 = c_0_75 + (reinterpret_cast(c_0_75) % 16) / sizeof(float); static const float* const_1_25 = c_1_25 + (reinterpret_cast(c_1_25) % 16) / sizeof(float); @@ -1193,15 +1204,15 @@ void GridSampleKernel::bicubicCoefficients(const Vmm& vCoef, const V } } -template // Works for AVX512, AVX2, AVX, SSE41 +template // Works for AVX512, AVX2, AVX, SSE41 void GridSampleKernel::nearestInterpolation(const Vmm& vWCoord, const Vmm& vHCoord, bool tail) { const auto& vSrcShift = vWCoord; - const auto& vAux = vHCoord; - auto kGatherMask = getMask(); - auto kAuxMask = getMask(); + const auto& vAux = vHCoord; + auto kGatherMask = getMask(); + auto kAuxMask = getMask(); - uni_vroundps(vWCoord, vWCoord, 0x0); // Round near - uni_vroundps(vHCoord, vHCoord, 0x0); // Round near + uni_vroundps(vWCoord, vWCoord, 0x0); // Round near + uni_vroundps(vHCoord, vHCoord, 0x0); // Round near bool useMask = false, zeroFill = false; if (jcp.paddingMode == GridSamplePaddingMode::ZEROS) { @@ -1272,15 +1283,15 @@ template <> void GridSampleKernel::bilinearInterpolation(const Vmm& vWCoord, const Vmm& vHCoord, bool tail) { const auto& vDX = vWCoord; const auto& vDY = vHCoord; - auto shift00 = getVmm(); - auto shift01 = getVmm(); - auto shift10 = getVmm(); - auto shift11 = getVmm(); - auto vAux = getVmm(); + auto shift00 = getVmm(); + auto shift01 = getVmm(); + auto shift10 = getVmm(); + auto shift11 = getVmm(); + auto vAux = getVmm(); RegistersPool::Reg kMask00, kMask01, kMask10, kMask11; - uni_vroundps(shift00, vWCoord, 0x1); // Round floor - uni_vroundps(shift01, vHCoord, 0x1); // Round floor + uni_vroundps(shift00, vWCoord, 0x1); // Round floor + uni_vroundps(shift01, vHCoord, 0x1); // Round floor uni_vsubps(vDX, vWCoord, shift00); uni_vsubps(vDY, vHCoord, shift01); uni_vaddps(shift10, shift00, vOnesF); @@ -1294,10 +1305,10 @@ void GridSampleKernel::bilinearInterpolation(const Vmm& vWCoor kMask10 = getMask(); kMask11 = getMask(); - zerosPadding(kMask00, shift01, shift00); // 
(y; x) - zerosPadding(kMask01, shift01, shift10); // (y; x + 1) - zerosPadding(kMask11, shift11, shift10); // (y + 1; x + 1) - zerosPadding(kMask10, shift11, shift00); // (y + 1; x) + zerosPadding(kMask00, shift01, shift00); // (y; x) + zerosPadding(kMask01, shift01, shift10); // (y; x + 1) + zerosPadding(kMask11, shift11, shift10); // (y + 1; x + 1) + zerosPadding(kMask10, shift11, shift00); // (y + 1; x) hwShiftPs2dq(shift00, shift01, shift00, vSrcWidthF); uni_vpaddd(shift01, shift00, vDataTypeSizeB); @@ -1330,8 +1341,8 @@ void GridSampleKernel::bilinearInterpolation(const Vmm& vWCoor // PER CHANNEL LOOP Xbyak::Label lChannelLoopBegin, lChannelLoopEnd; RegistersPool::Reg rChannel; - auto rSrcTmp = getReg64(); - auto rDstTmp = getReg64(); + auto rSrcTmp = getReg64(); + auto rDstTmp = getReg64(); mov(rSrcTmp, regSrc); mov(rDstTmp, regDst); @@ -1349,11 +1360,11 @@ void GridSampleKernel::bilinearInterpolation(const Vmm& vWCoor if (jcp.paddingMode == GridSamplePaddingMode::ZEROS) { kmovw(kAuxMask, kMask00); } - gatherdd(vQ0, rSrcTmp, shift00, kAuxMask, useMask, zeroFill); // v00 -> vQ0 + gatherdd(vQ0, rSrcTmp, shift00, kAuxMask, useMask, zeroFill); // v00 -> vQ0 if (jcp.inDataPrc == ov::element::i32) { uni_vcvtdq2ps(vQ0, vQ0); } - uni_vfmsub213ps(vQ0, vDX, vQ0); // q0 = -(v00 - dx * v00) + uni_vfmsub213ps(vQ0, vDX, vQ0); // q0 = -(v00 - dx * v00) // (y; x + 1) if (jcp.paddingMode == GridSamplePaddingMode::ZEROS) { @@ -1363,7 +1374,7 @@ void GridSampleKernel::bilinearInterpolation(const Vmm& vWCoor if (jcp.inDataPrc == ov::element::i32) { uni_vcvtdq2ps(vAux, vAux); } - uni_vfmsub231ps(vQ0, vAux, vDX); // q0 = -q0 + dx * v01 + uni_vfmsub231ps(vQ0, vAux, vDX); // q0 = -q0 + dx * v01 // (y + 1; x + 1) if (jcp.paddingMode == GridSamplePaddingMode::ZEROS) { @@ -1383,14 +1394,14 @@ void GridSampleKernel::bilinearInterpolation(const Vmm& vWCoor uni_vcvtdq2ps(vQ1, vQ1); } - uni_vfmsub213ps(vQ1, vDX, vQ1); // q1 = -(v10 - dx * v10) - uni_vfmsub231ps(vQ1, vAux, vDX); // q1 = -q1 
+ dx * v11 + uni_vfmsub213ps(vQ1, vDX, vQ1); // q1 = -(v10 - dx * v10) + uni_vfmsub231ps(vQ1, vAux, vDX); // q1 = -q1 + dx * v11 // Res = q0 + dy * (q1 - q0) uni_vsubps(vQ1, vQ1, vQ0); uni_vfmadd132ps(vQ1, vQ0, vDY); if (jcp.inDataPrc == ov::element::i32) { - uni_vroundps(vQ1, vQ1, 0x3); // Truncation + uni_vroundps(vQ1, vQ1, 0x3); // Truncation uni_vcvtps2dq(vQ1, vQ1); } @@ -1410,20 +1421,20 @@ void GridSampleKernel::bilinearInterpolation(const Vmm& vWCoor } } -template // Works for AVX2, AVX, SSE41 +template // Works for AVX2, AVX, SSE41 void GridSampleKernel::bilinearInterpolation(const Vmm& vWCoord, const Vmm& vHCoord, bool tail) { auto vWRound = getVmm(); auto vHRound = getVmm(); - auto& vDX = vWCoord; - auto& vDY = vHCoord; - auto vAux = getVmm(); + auto& vDX = vWCoord; + auto& vDY = vHCoord; + auto vAux = getVmm(); Vmm shift00, shift01, shift10, shift11; RegistersPool::Reg shift10Holder, shift11Holder; // For ZEROS padding only. RegistersPool::Reg vMask00, vMask01, vMask10, vMask11; - uni_vroundps(vWRound, vWCoord, 0x1); // Round floor - uni_vroundps(vHRound, vHCoord, 0x1); // Round floor + uni_vroundps(vWRound, vWCoord, 0x1); // Round floor + uni_vroundps(vHRound, vHCoord, 0x1); // Round floor uni_vsubps(vDX, vDX, vWRound); uni_vsubps(vDY, vDY, vHRound); @@ -1444,9 +1455,9 @@ void GridSampleKernel::bilinearInterpolation(const Vmm& vWCoord, const Vmm& useMask = zeroFill = true; { auto rAux = getReg64(); - static const float onesArr[8] = { 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f }; - if (isa ==x64::sse41) { - static const float *onesPtr = onesArr + (reinterpret_cast(onesArr) % 16) / sizeof(float); + static const float onesArr[8] = {1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f}; + if (isa == x64::sse41) { + static const float* onesPtr = onesArr + (reinterpret_cast(onesArr) % 16) / sizeof(float); mov(rAux, reinterpret_cast(onesPtr)); } else { mov(rAux, reinterpret_cast(onesArr)); @@ -1463,10 +1474,10 @@ void GridSampleKernel::bilinearInterpolation(const Vmm& vWCoord, 
const Vmm& uni_vaddps(vMask00, vWRound, vAux); uni_vaddps(vAux, vAux, vHRound); - zerosPadding(vMask01, vHRound, vMask00); // (y; x + 1) - zerosPadding(vMask10, vAux, vWRound); // (y + 1; x) - zerosPadding(vMask11, vAux, vMask00); // (y + 1; x + 1) - zerosPadding(vMask00, vHRound, vWRound); // (y; x) + zerosPadding(vMask01, vHRound, vMask00); // (y; x + 1) + zerosPadding(vMask10, vAux, vWRound); // (y + 1; x) + zerosPadding(vMask11, vAux, vMask00); // (y + 1; x + 1) + zerosPadding(vMask00, vHRound, vWRound); // (y; x) hwShiftPs2dq(shift00, vHRound, vWRound, vSrcWidthF); } else if (jcp.paddingMode == GridSamplePaddingMode::BORDER) { @@ -1490,17 +1501,17 @@ void GridSampleKernel::bilinearInterpolation(const Vmm& vWCoord, const Vmm& } auto vGatherMask = getVmm(); - auto vQ0 = getVmm(); - auto vQ1 = getVmm(); + auto vQ0 = getVmm(); + auto vQ1 = getVmm(); // PER CHANNEL LOOP Xbyak::Label lChannelLoopBegin, lChannelLoopEnd; RegistersPool::Reg rChannel; - auto rSrcTmp = getReg64(); - auto rDstTmp = getReg64(); + auto rSrcTmp = getReg64(); + auto rDstTmp = getReg64(); auto rTypeSize = getReg64(); - mov(rSrcTmp, regSrc); - mov(rDstTmp, regDst); + mov(rSrcTmp, regSrc); + mov(rDstTmp, regDst); mov(rTypeSize, ptr[regParams + GET_OFF(dataTypeSize)]); for (uint64_t ch = 0; ch < jcp.cannelNum; ch++) { @@ -1517,12 +1528,17 @@ void GridSampleKernel::bilinearInterpolation(const Vmm& vWCoord, const Vmm& if (jcp.paddingMode == GridSamplePaddingMode::ZEROS && isa == x64::avx2) { uni_vmovups(vGatherMask, vMask00); } - gatherdd(vQ0, rSrcTmp, shift00, (isa == x64::avx2 || !vMask00.isInitialized()) ? vGatherMask : vMask00, useMask, zeroFill); // v00 -> vQ0 + gatherdd(vQ0, + rSrcTmp, + shift00, + (isa == x64::avx2 || !vMask00.isInitialized()) ? 
vGatherMask : vMask00, + useMask, + zeroFill); // v00 -> vQ0 if (jcp.inDataPrc == ov::element::i32) { uni_vcvtdq2ps(vQ0, vQ0); } if (isa == x64::avx2) { - uni_vfmsub213ps(vQ0, vDX, vQ0); // q0 = -(v00 - dx * v00) + uni_vfmsub213ps(vQ0, vDX, vQ0); // q0 = -(v00 - dx * v00) } else { uni_vmulps(vGatherMask, vQ0, vDX); uni_vsubps(vQ0, vQ0, vGatherMask); @@ -1534,13 +1550,17 @@ void GridSampleKernel::bilinearInterpolation(const Vmm& vWCoord, const Vmm& if (isa == x64::avx2) uni_vmovups(vGatherMask, vMask01); } - gatherdd(vAux, rSrcTmp, jcp.paddingMode != GridSamplePaddingMode::ZEROS ? shift01 : shift10, - (isa == x64::avx2 || !vMask01.isInitialized()) ? vGatherMask : vMask01, useMask, zeroFill); + gatherdd(vAux, + rSrcTmp, + jcp.paddingMode != GridSamplePaddingMode::ZEROS ? shift01 : shift10, + (isa == x64::avx2 || !vMask01.isInitialized()) ? vGatherMask : vMask01, + useMask, + zeroFill); if (jcp.inDataPrc == ov::element::i32) { uni_vcvtdq2ps(vAux, vAux); } if (isa == x64::avx2) { - uni_vfmsub231ps(vQ0, vAux, vDX); // q0 = -q0 + dx * v01 + uni_vfmsub231ps(vQ0, vAux, vDX); // q0 = -q0 + dx * v01 } else { uni_vmulps(vAux, vAux, vDX); uni_vaddps(vQ0, vQ0, vAux); @@ -1556,8 +1576,12 @@ void GridSampleKernel::bilinearInterpolation(const Vmm& vWCoord, const Vmm& if (isa == x64::avx2) uni_vmovups(vGatherMask, vMask11); } - gatherdd(vAux, rSrcTmp, jcp.paddingMode != GridSamplePaddingMode::ZEROS ? shift11 : shift10, - (isa == x64::avx2 || !vMask11.isInitialized()) ? vGatherMask : vMask11, useMask, zeroFill); + gatherdd(vAux, + rSrcTmp, + jcp.paddingMode != GridSamplePaddingMode::ZEROS ? shift11 : shift10, + (isa == x64::avx2 || !vMask11.isInitialized()) ? 
vGatherMask : vMask11, + useMask, + zeroFill); if (jcp.inDataPrc == ov::element::i32) { uni_vcvtdq2ps(vAux, vAux); } @@ -1568,7 +1592,12 @@ void GridSampleKernel::bilinearInterpolation(const Vmm& vWCoord, const Vmm& if (isa == x64::avx2) uni_vmovups(vGatherMask, vMask10); } - gatherdd(vQ1, rSrcTmp, shift10, (isa == x64::avx2 || !vMask10.isInitialized()) ? vGatherMask : vMask10, useMask, zeroFill); + gatherdd(vQ1, + rSrcTmp, + shift10, + (isa == x64::avx2 || !vMask10.isInitialized()) ? vGatherMask : vMask10, + useMask, + zeroFill); if (jcp.inDataPrc == ov::element::i32) { uni_vcvtdq2ps(vQ1, vQ1); } @@ -1585,13 +1614,13 @@ void GridSampleKernel::bilinearInterpolation(const Vmm& vWCoord, const Vmm& uni_vmovups(vQ1, vGatherMask); } } - uni_vfmsub231ps(vQ1, vAux, vDX); // q1 = -q1 + dx * v11 + uni_vfmsub231ps(vQ1, vAux, vDX); // q1 = -q1 + dx * v11 // Res = q0 + dy * (q1 - q0) uni_vsubps(vQ1, vQ1, vQ0); uni_vfmadd132ps(vQ1, vQ0, vDY); if (jcp.inDataPrc == ov::element::i32) { - uni_vroundps(vQ1, vQ1, 0x3); // Truncation + uni_vroundps(vQ1, vQ1, 0x3); // Truncation uni_vcvtps2dq(vQ1, vQ1); } @@ -1614,27 +1643,27 @@ void GridSampleKernel::bilinearInterpolation(const Vmm& vWCoord, const Vmm& template <> void GridSampleKernel::bicubicInterpolation(const Vmm& vWCoord, const Vmm& vHCoord, bool tail) { - auto vHTop = getVmm(); - auto vWLeft = getVmm(); - auto vDX = getVmm(); - auto vDY = getVmm(); - auto vXDotProd = getVmm(); + auto vHTop = getVmm(); + auto vWLeft = getVmm(); + auto vDX = getVmm(); + auto vDY = getVmm(); + auto vXDotProd = getVmm(); auto& vYDotProd = vDX; auto vSrcShift0 = getVmm(); - auto vSrcShift = getVmm(); - auto vAux = getVmm(); - auto kAuxMask = getMask(); + auto vSrcShift = getVmm(); + auto vAux = getVmm(); + auto kAuxMask = getMask(); RegistersPool::Reg kMaskH; std::vector> wMasks; - uni_vroundps(vHTop, vHCoord, 0x1); // Round floor - uni_vroundps(vWLeft, vWCoord, 0x1); // Round floor + uni_vroundps(vHTop, vHCoord, 0x1); // Round floor + 
uni_vroundps(vWLeft, vWCoord, 0x1); // Round floor uni_vsubps(vDY, vHCoord, vHTop); uni_vsubps(vDX, vWCoord, vWLeft); uni_vsubps(vHTop, vHTop, vOnesF); uni_vsubps(vWLeft, vWLeft, vOnesF); - RegistersPool::Reg vCX[4] = {getVmm(), getVmm(), getVmm(), getVmm() }; + RegistersPool::Reg vCX[4] = {getVmm(), getVmm(), getVmm(), getVmm()}; for (int i = 0; i < 4; i++) { bicubicCoefficients(vCX[i], vDX, i); } @@ -1659,8 +1688,8 @@ void GridSampleKernel::bicubicInterpolation(const Vmm& vWCoord // PER CHANNEL LOOP Xbyak::Label lChannelLoopBegin, lChannelLoopEnd; RegistersPool::Reg rChannel; - auto rSrcTmp = getReg64(); - auto rDstTmp = getReg64(); + auto rSrcTmp = getReg64(); + auto rDstTmp = getReg64(); mov(rSrcTmp, regSrc); mov(rDstTmp, regDst); @@ -1742,7 +1771,7 @@ void GridSampleKernel::bicubicInterpolation(const Vmm& vWCoord } if (jcp.inDataPrc == ov::element::i32) { - uni_vroundps(vYDotProd, vYDotProd, 0x3); // Truncation + uni_vroundps(vYDotProd, vYDotProd, 0x3); // Truncation uni_vcvtps2dq(vYDotProd, vYDotProd); } @@ -1762,15 +1791,15 @@ void GridSampleKernel::bicubicInterpolation(const Vmm& vWCoord } } -template // Works for AVX2, AVX, SSE41 +template // Works for AVX2, AVX, SSE41 void GridSampleKernel::bicubicInterpolation(const Vmm& vWCoord, const Vmm& vHCoord, bool tail) { - auto vHTop = getVmm(); + auto vHTop = getVmm(); auto vWLeft = getVmm(); - auto vDX = getVmm(); - auto vDY = getVmm(); + auto vDX = getVmm(); + auto vDY = getVmm(); - uni_vroundps(vHTop, vHCoord, 0x1); // Round floor - uni_vroundps(vWLeft, vWCoord, 0x1); // Round floor + uni_vroundps(vHTop, vHCoord, 0x1); // Round floor + uni_vroundps(vWLeft, vWCoord, 0x1); // Round floor uni_vsubps(vDY, vHCoord, vHTop); uni_vsubps(vDX, vWCoord, vWLeft); uni_vsubps(vHTop, vHTop, vOnesF); @@ -1791,7 +1820,7 @@ void GridSampleKernel::bicubicInterpolation(const Vmm& vWCoord, const Vmm& } auto vW0 = getVmm(), vW1 = getVmm(); - Vmm vW[4] = { vW0, vW1, vHCoord, vWCoord }; + Vmm vW[4] = {vW0, vW1, vHCoord, vWCoord}; 
for (int w = 0; w < 4; w++) { borderPadding(vW[w], vWLeft, coord::w); if (w < 3) { @@ -1806,7 +1835,7 @@ void GridSampleKernel::bicubicInterpolation(const Vmm& vWCoord, const Vmm& mov(rAux, ptr[regParams + GET_OFF(srcHeightSub1F)]); uni_vmovups(vSrcHeightSub1F, ptr[rAux]); } - auto vH = getVmm(); + auto vH = getVmm(); size_t bufShift = 0lu; for (int h = 0; h < 4; h++) { @@ -1839,7 +1868,7 @@ void GridSampleKernel::bicubicInterpolation(const Vmm& vWCoord, const Vmm& } auto vW0 = getVmm(), vW1 = getVmm(); - Vmm vW[4] = { vW0, vW1, vHCoord, vWCoord }; + Vmm vW[4] = {vW0, vW1, vHCoord, vWCoord}; for (int w = 0; w < 4; w++) { reflectionPadding(vW[w], vWLeft, coord::w); if (w < 3) { @@ -1860,7 +1889,7 @@ void GridSampleKernel::bicubicInterpolation(const Vmm& vWCoord, const Vmm& mov(rAux, ptr[regParams + GET_OFF(srcHeightMul2Sub1F)]); uni_vmovups(vSrcHeightMul2Sub1F, ptr[rAux]); } - auto vH = getVmm(); + auto vH = getVmm(); size_t bufShift = 0lu; for (int h = 0; h < 4; h++) { @@ -1883,7 +1912,7 @@ void GridSampleKernel::bicubicInterpolation(const Vmm& vWCoord, const Vmm& } else if (jcp.paddingMode == GridSamplePaddingMode::ZEROS) { useMask = zeroFill = true; - RegistersPool::Reg vWMask[4] = { getVmm(), getVmm(), getVmm(), getVmm() }; + RegistersPool::Reg vWMask[4] = {getVmm(), getVmm(), getVmm(), getVmm()}; for (int w = 0; w < 4; w++) { if (w == 0) { zerosPaddingW(vWMask[w], vWLeft); @@ -1933,21 +1962,21 @@ void GridSampleKernel::bicubicInterpolation(const Vmm& vWCoord, const Vmm& vDataTypeSizeB.release(); } - RegistersPool::Reg vCX[4] = { getVmm(), getVmm(), getVmm(), getVmm() }; + RegistersPool::Reg vCX[4] = {getVmm(), getVmm(), getVmm(), getVmm()}; for (int w = 0; w < 4; w++) { bicubicCoefficients(vCX[w], vDX, w); } auto vCY0 = getVmm(), vCY1 = getVmm(); - Vmm vCY[4] = { vCY0, vCY1, vHCoord, vWCoord }; + Vmm vCY[4] = {vCY0, vCY1, vHCoord, vWCoord}; for (int h = 0; h < 4; h++) { bicubicCoefficients(vCY[h], vDY, h); } const auto& vXDotProd = vDX; const auto& vYDotProd = 
vDY; - auto vSrcShift = getVmm(); + auto vSrcShift = getVmm(); auto kGatherMask = getVmm(); - auto vAux = getVmm(); + auto vAux = getVmm(); // PER CHANNEL LOOP Xbyak::Label lChannelLoopBegin, lChannelLoopEnd; @@ -2003,7 +2032,7 @@ void GridSampleKernel::bicubicInterpolation(const Vmm& vWCoord, const Vmm& } if (jcp.inDataPrc == ov::element::i32) { - uni_vroundps(vYDotProd, vYDotProd, 0x3); // Truncation + uni_vroundps(vYDotProd, vYDotProd, 0x3); // Truncation uni_vcvtps2dq(vYDotProd, vYDotProd); } @@ -2028,7 +2057,7 @@ void GridSampleKernel::dataTypeShiftPs2Dq(const Vmm& vDst, const Vmm& vSrc) if (dataTypeSize == 1) return; - if (isa == x64::avx) { // vpslld works just with XMM for AVX, so use vmulps for YMM + if (isa == x64::avx) { // vpslld works just with XMM for AVX, so use vmulps for YMM auto rAux = getReg64(); static const float val = dataTypeSize; static const float dataTypeSizeArr[8] = {val, val, val, val, val, val, val, val}; @@ -2038,7 +2067,7 @@ void GridSampleKernel::dataTypeShiftPs2Dq(const Vmm& vDst, const Vmm& vSrc) } else { uni_vcvtps2dq(vDst, vSrc); if (dataTypeSize > 1) - uni_vpslld(vDst, vDst, dataTypeShift); // multiply by source data type size. + uni_vpslld(vDst, vDst, dataTypeShift); // multiply by source data type size. } } @@ -2066,7 +2095,7 @@ void GridSampleKernel::hwShiftPs2dq(const Vmm& vDst, const Vmm& vHCoord, co } } - if (isa == x64::avx) { // vpslld works just with XMM for AVX, so use vmulps for YMM + if (isa == x64::avx) { // vpslld works just with XMM for AVX, so use vmulps for YMM if (dataTypeSize > 1) { auto rAux = getReg64(); const float val = dataTypeSize; @@ -2078,7 +2107,7 @@ void GridSampleKernel::hwShiftPs2dq(const Vmm& vDst, const Vmm& vHCoord, co } else { uni_vcvtps2dq(vDst, vDst); if (dataTypeSize > 1) - uni_vpslld(vDst, vDst, dataTypeShift); // multiply by source data type size. + uni_vpslld(vDst, vDst, dataTypeShift); // multiply by source data type size. 
} } @@ -2086,6 +2115,6 @@ template class GridSampleKernel; template class GridSampleKernel; template class GridSampleKernel; -} // namespace kernel -} // namespace intel_cpu -} // namespace ov +} // namespace kernel +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/grid_sample.hpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/grid_sample.hpp index cb13d62c3509d1..f276580a837bd2 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/grid_sample.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/grid_sample.hpp @@ -4,9 +4,10 @@ #pragma once -#include "jit_kernel_base.hpp" #include +#include "jit_kernel_base.hpp" + namespace ov { namespace intel_cpu { @@ -20,16 +21,16 @@ class GridSampleKernelBase; #if defined(OPENVINO_ARCH_X86_64) struct GridSampleKernelConfParams { - bool dynamicShapes = false; - bool dynamicBatch = false; + bool dynamicShapes = false; + bool dynamicBatch = false; bool dynamicChannel = false; - bool alignCorners = false; + bool alignCorners = false; GridSampleInterpolationMode interpolationMode = GridSampleInterpolationMode::BILINEAR; GridSamplePaddingMode paddingMode = GridSamplePaddingMode::ZEROS; ov::element::Type inDataPrc; ov::element::Type gridPrc; - uint64_t batchNum = 1lu; - uint64_t cannelNum = 1lu; + uint64_t batchNum = 1lu; + uint64_t cannelNum = 1lu; uint64_t srcBatchStepB = 0lu; }; @@ -37,13 +38,13 @@ struct GridSamplesKernelExecArgs { const void* src; const void* grid; void* dst; - uint64_t batchNum = 1lu; + uint64_t batchNum = 1lu; uint64_t channelsNum = 1lu; const float* srcWidthF; const float* srcHeightF; - uint64_t srcBatchStepB = 0lu; - uint64_t gridBatchStepB = 0lu; - uint64_t dstBatchStepB = 0lu; + uint64_t srcBatchStepB = 0lu; + uint64_t gridBatchStepB = 0lu; + uint64_t dstBatchStepB = 0lu; uint64_t srcChannelStepB = 0lu; uint64_t dstChannelStepB = 0lu; const void* wDenormCoefF; @@ -60,19 +61,21 @@ struct GridSamplesKernelExecArgs { uint64_t workAmount = 0lu; }; -enum 
coord { - w, h -}; +enum coord { w, h }; -class GridSampleKernelBase: public JitKernelBase { +class GridSampleKernelBase : public JitKernelBase { public: - void (*ker_)(const GridSamplesKernelExecArgs *); - void operator()(const GridSamplesKernelExecArgs *args) { + void (*ker_)(const GridSamplesKernelExecArgs*); + void operator()(const GridSamplesKernelExecArgs* args) { assert(ker_); ker_(args); } - explicit GridSampleKernelBase(const char* name, const GridSampleKernelConfParams& jcp, dnnl::impl::cpu::x64::cpu_isa_t isa) - : JitKernelBase(name, isa), ker_(nullptr), jcp(jcp) {} + explicit GridSampleKernelBase(const char* name, + const GridSampleKernelConfParams& jcp, + dnnl::impl::cpu::x64::cpu_isa_t isa) + : JitKernelBase(name, isa), + ker_(nullptr), + jcp(jcp) {} virtual void create_ker() = 0; uint64_t getVecLen() { @@ -87,7 +90,7 @@ class GridSampleKernelBase: public JitKernelBase { protected: GridSampleKernelConfParams jcp; - uint64_t vlen = 16lu; + uint64_t vlen = 16lu; uint64_t dataTypeSize = 1lu; uint64_t gridTypeSize = 1lu; uint64_t dataElPerVec = 1lu; @@ -104,12 +107,16 @@ class GridSampleKernel : public GridSampleKernelBase { void create_ker() override; void generate() override; - using Vmm = typename dnnl::impl::utils::conditional3::type; - using Vmask = typename dnnl::impl::utils::conditional3::type; + using Vmm = typename dnnl::impl::utils::conditional3::type; + using Vmask = typename dnnl::impl::utils::conditional3::type; private: uint8_t dataTypeShift = 0; @@ -138,23 +145,23 @@ class GridSampleKernel : public GridSampleKernelBase { RegistersPool::Reg vWDenormCoefF; RegistersPool::Reg vHDenormCoefF; RegistersPool::Reg vGridPermMask; - RegistersPool::Reg vDataTypeSizeB; // for ZEROS padding - RegistersPool::Reg vSrcWidthB; // for ZEROS padding + RegistersPool::Reg vDataTypeSizeB; // for ZEROS padding + RegistersPool::Reg vSrcWidthB; // for ZEROS padding - RegistersPool::Reg vSrcHeightSub1F; // for BORDER padding - RegistersPool::Reg vSrcWidthSub1F; // 
for BORDER padding + RegistersPool::Reg vSrcHeightSub1F; // for BORDER padding + RegistersPool::Reg vSrcWidthSub1F; // for BORDER padding - RegistersPool::Reg vSrcHeightMul2F; // for REFLECTION padding - RegistersPool::Reg vSrcWidthMul2F; // for REFLECTION padding - RegistersPool::Reg vSrcHeightMul2Sub1F; // for REFLECTION padding - RegistersPool::Reg vSrcWidthMul2Sub1F; // for REFLECTION padding - RegistersPool::Reg vAbsMask; // for REFLECTION padding + RegistersPool::Reg vSrcHeightMul2F; // for REFLECTION padding + RegistersPool::Reg vSrcWidthMul2F; // for REFLECTION padding + RegistersPool::Reg vSrcHeightMul2Sub1F; // for REFLECTION padding + RegistersPool::Reg vSrcWidthMul2Sub1F; // for REFLECTION padding + RegistersPool::Reg vAbsMask; // for REFLECTION padding - RegistersPool::Reg vConst_0_75; // for BICUBIC interpolation - RegistersPool::Reg vConst_1_25; // for BICUBIC interpolation - RegistersPool::Reg vConst_1_50; // for BICUBIC interpolation - RegistersPool::Reg vConst_2_00; // for BICUBIC interpolation - RegistersPool::Reg vConst_2_25; // for BICUBIC interpolation + RegistersPool::Reg vConst_0_75; // for BICUBIC interpolation + RegistersPool::Reg vConst_1_25; // for BICUBIC interpolation + RegistersPool::Reg vConst_1_50; // for BICUBIC interpolation + RegistersPool::Reg vConst_2_00; // for BICUBIC interpolation + RegistersPool::Reg vConst_2_25; // for BICUBIC interpolation void initVectors(); void process(); @@ -179,8 +186,8 @@ class GridSampleKernel : public GridSampleKernelBase { void hwShiftPs2dq(const Vmm& vDst, const Vmm& vHCoord, const Vmm& vWCoord, const Vmm& vWidth); }; -#endif // OPENVINO_ARCH_X86_64 +#endif // OPENVINO_ARCH_X86_64 -} // namespace kernel -} // namespace intel_cpu -} // namespace ov +} // namespace kernel +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/jit_kernel.cpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/jit_kernel.cpp index cd8b32d9ad2a38..2eb981007f2217 100644 --- 
a/src/plugins/intel_cpu/src/nodes/kernels/x64/jit_kernel.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/jit_kernel.cpp @@ -3,9 +3,10 @@ // #include "jit_kernel.hpp" -#include -#include + #include +#include +#include #include using namespace dnnl::impl; @@ -17,16 +18,16 @@ namespace intel_cpu { namespace { -template +template using registers = std::array, 16>; bool isRegAllocable(int id) { - return id != abi_param1.getIdx() // function argument - && id != Operand::Code::RSP; // stack pointer + return id != abi_param1.getIdx() // function argument + && id != Operand::Code::RSP; // stack pointer } -template -const RegType & reserveReg(jit_kernel::reg_indices & freeRegs, const registers & regs) { +template +const RegType& reserveReg(jit_kernel::reg_indices& freeRegs, const registers& regs) { if (freeRegs.empty()) throw std::runtime_error("No free registers"); const auto idx = freeRegs.back(); @@ -34,8 +35,8 @@ const RegType & reserveReg(jit_kernel::reg_indices & freeRegs, const registers -void freeReg(jit_kernel::reg_indices & freeRegs, const registers & regs, const RegType & reg) { +template +void freeReg(jit_kernel::reg_indices& freeRegs, const registers& regs, const RegType& reg) { const auto idx = reg.getIdx(); // Debug: // auto it = std::find(freeRegs.begin(), freeRegs.end(), idx); @@ -46,105 +47,189 @@ void freeReg(jit_kernel::reg_indices & freeRegs, const registers & regs OPENVINO_THROW("Some register was freed twice"); } -const registers & x64regs() { +const registers& x64regs() { using namespace Xbyak::util; - static const registers _x64regs {{ - rax, rcx, rdx, rbx, - rsp, rbp, rsi, rdi, - r8, r9, r10, r11, - r12, r13, r14, r15, + static const registers _x64regs{{ + rax, + rcx, + rdx, + rbx, + rsp, + rbp, + rsi, + rdi, + r8, + r9, + r10, + r11, + r12, + r13, + r14, + r15, }}; return _x64regs; } -const registers & x32regs() { +const registers& x32regs() { using namespace Xbyak::util; - static const registers _x32regs {{ - eax, ecx, edx, ebx, - esp, ebp, 
esi, edi, - r8d, r9d, r10d, r11d, - r12d, r13d, r14d, r15d, + static const registers _x32regs{{ + eax, + ecx, + edx, + ebx, + esp, + ebp, + esi, + edi, + r8d, + r9d, + r10d, + r11d, + r12d, + r13d, + r14d, + r15d, }}; return _x32regs; } -const registers & x16regs() { +const registers& x16regs() { using namespace Xbyak::util; - static const registers _x16regs {{ - ax, cx, dx, bx, - sp, bp, si, di, - r8w, r9w, r10w, r11w, - r12w, r13w, r14w, r15w, + static const registers _x16regs{{ + ax, + cx, + dx, + bx, + sp, + bp, + si, + di, + r8w, + r9w, + r10w, + r11w, + r12w, + r13w, + r14w, + r15w, }}; return _x16regs; } -const registers & x8regs() { +const registers& x8regs() { using namespace Xbyak::util; - static const registers _x8regs {{ - al, cl, dl, bl, - spl, bpl, sil, dil, - r8b, r9b, r10b, r11b, - r12b, r13b, r14b, r15b, + static const registers _x8regs{{ + al, + cl, + dl, + bl, + spl, + bpl, + sil, + dil, + r8b, + r9b, + r10b, + r11b, + r12b, + r13b, + r14b, + r15b, }}; return _x8regs; } -const registers & xmmregs() { - static const registers _xmmregs {{ - Xbyak::util::xmm0, Xbyak::util::xmm1, Xbyak::util::xmm2, Xbyak::util::xmm3, - Xbyak::util::xmm4, Xbyak::util::xmm5, Xbyak::util::xmm6, Xbyak::util::xmm7, - Xbyak::util::xmm8, Xbyak::util::xmm9, Xbyak::util::xmm10, Xbyak::util::xmm11, - Xbyak::util::xmm12, Xbyak::util::xmm13, Xbyak::util::xmm14, Xbyak::util::xmm15, +const registers& xmmregs() { + static const registers _xmmregs{{ + Xbyak::util::xmm0, + Xbyak::util::xmm1, + Xbyak::util::xmm2, + Xbyak::util::xmm3, + Xbyak::util::xmm4, + Xbyak::util::xmm5, + Xbyak::util::xmm6, + Xbyak::util::xmm7, + Xbyak::util::xmm8, + Xbyak::util::xmm9, + Xbyak::util::xmm10, + Xbyak::util::xmm11, + Xbyak::util::xmm12, + Xbyak::util::xmm13, + Xbyak::util::xmm14, + Xbyak::util::xmm15, }}; return _xmmregs; } -const registers & ymmregs() { - static const registers _ymmregs {{ - Xbyak::util::ymm0, Xbyak::util::ymm1, Xbyak::util::ymm2, Xbyak::util::ymm3, - Xbyak::util::ymm4, 
Xbyak::util::ymm5, Xbyak::util::ymm6, Xbyak::util::ymm7, - Xbyak::util::ymm8, Xbyak::util::ymm9, Xbyak::util::ymm10, Xbyak::util::ymm11, - Xbyak::util::ymm12, Xbyak::util::ymm13, Xbyak::util::ymm14, Xbyak::util::ymm15, +const registers& ymmregs() { + static const registers _ymmregs{{ + Xbyak::util::ymm0, + Xbyak::util::ymm1, + Xbyak::util::ymm2, + Xbyak::util::ymm3, + Xbyak::util::ymm4, + Xbyak::util::ymm5, + Xbyak::util::ymm6, + Xbyak::util::ymm7, + Xbyak::util::ymm8, + Xbyak::util::ymm9, + Xbyak::util::ymm10, + Xbyak::util::ymm11, + Xbyak::util::ymm12, + Xbyak::util::ymm13, + Xbyak::util::ymm14, + Xbyak::util::ymm15, }}; return _ymmregs; } -const registers & zmmregs() { - static const registers _zmmregs {{ - Xbyak::util::zmm0, Xbyak::util::zmm1, Xbyak::util::zmm2, Xbyak::util::zmm3, - Xbyak::util::zmm4, Xbyak::util::zmm5, Xbyak::util::zmm6, Xbyak::util::zmm7, - Xbyak::util::zmm8, Xbyak::util::zmm9, Xbyak::util::zmm10, Xbyak::util::zmm11, - Xbyak::util::zmm12, Xbyak::util::zmm13, Xbyak::util::zmm14, Xbyak::util::zmm15, +const registers& zmmregs() { + static const registers _zmmregs{{ + Xbyak::util::zmm0, + Xbyak::util::zmm1, + Xbyak::util::zmm2, + Xbyak::util::zmm3, + Xbyak::util::zmm4, + Xbyak::util::zmm5, + Xbyak::util::zmm6, + Xbyak::util::zmm7, + Xbyak::util::zmm8, + Xbyak::util::zmm9, + Xbyak::util::zmm10, + Xbyak::util::zmm11, + Xbyak::util::zmm12, + Xbyak::util::zmm13, + Xbyak::util::zmm14, + Xbyak::util::zmm15, }}; return _zmmregs; } -} // namespace +} // namespace namespace internal { -template<> +template <> ov::element::Type type2precision() { return ov::element::f32; } -template<> +template <> ov::element::Type type2precision() { return ov::element::i32; } -template<> +template <> ov::element::Type type2precision() { return ov::element::bf16; } -template<> +template <> ov::element::Type type2precision() { return ov::element::u8; } -template<> +template <> ov::element::Type type2precision() { return ov::element::i8; } @@ -157,27 +242,24 @@ cpu_isa_t 
get_current_isa() { return cpu_isa_t::sse41; } -stack_frame::stack_frame(ov::intel_cpu::jit_kernel & kernel, size_t size, uint32_t alignment) - : _kernel(kernel) - , _size(size) - , _alignment(alignment) { +stack_frame::stack_frame(ov::intel_cpu::jit_kernel& kernel, size_t size, uint32_t alignment) + : _kernel(kernel), + _size(size), + _alignment(alignment) { if (_size || _alignment) { if (_size && _alignment == 1) { _kernel.sub(_kernel.rsp, _size); } else { auto tmp = _kernel.var(); tmp = _kernel.rsp; - _kernel.sub(_kernel.rsp, sizeof(size_t) + size); // allocate - _kernel.and_(_kernel.rsp, ~(alignment - 1)); // align - _kernel.mov(_kernel.ptr[_kernel.rsp + size], tmp); // remember previous rsp + _kernel.sub(_kernel.rsp, sizeof(size_t) + size); // allocate + _kernel.and_(_kernel.rsp, ~(alignment - 1)); // align + _kernel.mov(_kernel.ptr[_kernel.rsp + size], tmp); // remember previous rsp } } } -stack_frame::stack_frame(stack_frame && rhs) - : _kernel(rhs._kernel) - , _size(rhs._size) - , _alignment(rhs._alignment) { +stack_frame::stack_frame(stack_frame&& rhs) : _kernel(rhs._kernel), _size(rhs._size), _alignment(rhs._alignment) { rhs._size = 0; rhs._alignment = 0; } @@ -192,25 +274,29 @@ stack_frame::~stack_frame() { } } -const Xbyak::Reg64 & stack_frame::pointer() const { +const Xbyak::Reg64& stack_frame::pointer() const { return _kernel.rsp; } void stack_frame::clear() const { const size_t end = _size & ~(size_t)7u; - _kernel.foreach(0, end, [&](const Reg64 & idx) { - _kernel.mov(_kernel.qword[pointer() + idx], 0); - }, sizeof(size_t)); + _kernel.foreach ( + 0, + end, + [&](const Reg64& idx) { + _kernel.mov(_kernel.qword[pointer() + idx], 0); + }, + sizeof(size_t)); if (end < _size) { - _kernel.foreach(end, _size, [&](const Reg64 & idx) { + _kernel.foreach (end, _size, [&](const Reg64& idx) { _kernel.mov(_kernel.byte[pointer() + idx], 0); }); } } -const void * consts_table::store(const void *data, size_t size) { +const void* consts_table::store(const void* data, 
size_t size) { if (size > chunk_size) throw std::runtime_error("Data size is too large"); const size_t capacity = _chunks.size() * chunk_size; @@ -218,17 +304,16 @@ const void * consts_table::store(const void *data, size_t size) { _size = _chunks.size() * chunk_size; _chunks.emplace_back(); } - auto & dst = _chunks.back(); + auto& dst = _chunks.back(); const size_t offset = _size % chunk_size; memcpy(&dst[offset], data, size); _size += size; return &dst[offset]; } -} // namespace internal +} // namespace internal -jit_kernel::jit_kernel(const char* name) - : jit_generator(name) { +jit_kernel::jit_kernel(const char* name) : jit_generator(name) { _free_rmmregs.reserve(16); _free_rmmregs.reserve(16); @@ -239,73 +324,73 @@ jit_kernel::jit_kernel(const char* name) } } -template<> -const Reg64 & jit_kernel::reserve() { +template <> +const Reg64& jit_kernel::reserve() { return reserveReg(_free_x64regs, x64regs()); } -template<> -const Reg32 & jit_kernel::reserve() { +template <> +const Reg32& jit_kernel::reserve() { return reserveReg(_free_x64regs, x32regs()); } -template<> -const Reg16 & jit_kernel::reserve() { +template <> +const Reg16& jit_kernel::reserve() { return reserveReg(_free_x64regs, x16regs()); } -template<> -const Reg8 & jit_kernel::reserve() { +template <> +const Reg8& jit_kernel::reserve() { return reserveReg(_free_x64regs, x8regs()); } -template<> -void jit_kernel::free(const Reg64 & reg) { +template <> +void jit_kernel::free(const Reg64& reg) { freeReg(_free_x64regs, x64regs(), reg); } -template<> -void jit_kernel::free(const Reg32 & reg) { +template <> +void jit_kernel::free(const Reg32& reg) { freeReg(_free_x64regs, x32regs(), reg); } -template<> -void jit_kernel::free(const Reg16 & reg) { +template <> +void jit_kernel::free(const Reg16& reg) { freeReg(_free_x64regs, x16regs(), reg); } -template<> -void jit_kernel::free(const Reg8 & reg) { +template <> +void jit_kernel::free(const Reg8& reg) { freeReg(_free_x64regs, x8regs(), reg); } -template<> -const 
Xmm & jit_kernel::reserve() { +template <> +const Xmm& jit_kernel::reserve() { return reserveReg(_free_rmmregs, xmmregs()); } -template<> -void jit_kernel::free(const Xmm & reg) { +template <> +void jit_kernel::free(const Xmm& reg) { freeReg(_free_rmmregs, xmmregs(), reg); } -template<> -const Ymm & jit_kernel::reserve() { +template <> +const Ymm& jit_kernel::reserve() { return reserveReg(_free_rmmregs, ymmregs()); } -template<> -void jit_kernel::free(const Ymm & reg) { +template <> +void jit_kernel::free(const Ymm& reg) { freeReg(_free_rmmregs, ymmregs(), reg); } -template<> -const Zmm & jit_kernel::reserve() { +template <> +const Zmm& jit_kernel::reserve() { return reserveReg(_free_rmmregs, zmmregs()); } -template<> -void jit_kernel::free(const Zmm & reg) { +template <> +void jit_kernel::free(const Zmm& reg) { freeReg(_free_rmmregs, zmmregs(), reg); } @@ -317,26 +402,33 @@ void jit_kernel::postamble() { } } -const AddressFrame & jit_kernel::address_frame(size_t size) const { - switch (size) { - case 1: return byte; - case 2: return word; - case 4: return dword; - case 8: return qword; - case 16: return xword; - case 32: return yword; - case 64: return zword; - default: - break; - } - return ptr; +const AddressFrame& jit_kernel::address_frame(size_t size) const { + switch (size) { + case 1: + return byte; + case 2: + return word; + case 4: + return dword; + case 8: + return qword; + case 16: + return xword; + case 32: + return yword; + case 64: + return zword; + default: + break; + } + return ptr; } -const jit_kernel::reg_indices & jit_kernel::free_x64regs() const { +const jit_kernel::reg_indices& jit_kernel::free_x64regs() const { return _free_x64regs; } -const jit_kernel::reg_indices & jit_kernel::free_rmmregs() const { +const jit_kernel::reg_indices& jit_kernel::free_rmmregs() const { return _free_rmmregs; } @@ -386,5 +478,5 @@ void jit_kernel::uni_vblendps(const Xbyak::Zmm& z1, const Xbyak::Zmm& z2, uint16 vblendmps(z1 | k1, z1, z2); } -} // namespace 
intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/jit_kernel.hpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/jit_kernel.hpp index 8934bf5dff052b..0073ca91d0b76f 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/jit_kernel.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/jit_kernel.hpp @@ -3,14 +3,15 @@ // #pragma once -#include "cpu/x64/jit_generator.hpp" -#include "emitters/plugin/x64/jit_load_store_emitters.hpp" +#include #include -#include #include -#include -#include #include +#include +#include + +#include "cpu/x64/jit_generator.hpp" +#include "emitters/plugin/x64/jit_load_store_emitters.hpp" namespace ov { namespace intel_cpu { @@ -19,113 +20,103 @@ struct jit_kernel; namespace internal { -template +template struct reg_traits_by_size; -template +template struct reg_traits; -template +template struct reg_traits; -template +template struct isa_traits; -template<> +template <> struct reg_traits_by_size<1> { using type = Xbyak::Reg8; - constexpr static size_t size = 1; // in bytes - constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa - = dnnl::impl::cpu::x64::cpu_isa_t::isa_undef; + constexpr static size_t size = 1; // in bytes + constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa = dnnl::impl::cpu::x64::cpu_isa_t::isa_undef; }; -template<> +template <> struct reg_traits_by_size<2> { using type = Xbyak::Reg16; - constexpr static size_t size = 2; // in bytes - constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa - = dnnl::impl::cpu::x64::cpu_isa_t::isa_undef; + constexpr static size_t size = 2; // in bytes + constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa = dnnl::impl::cpu::x64::cpu_isa_t::isa_undef; }; -template<> +template <> struct reg_traits_by_size<4> { using type = Xbyak::Reg32; - constexpr static size_t size = 4; // in bytes - constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa - = dnnl::impl::cpu::x64::cpu_isa_t::isa_undef; + constexpr static size_t size 
= 4; // in bytes + constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa = dnnl::impl::cpu::x64::cpu_isa_t::isa_undef; }; -template<> +template <> struct reg_traits_by_size<8> { using type = Xbyak::Reg64; - constexpr static size_t size = 8; // in bytes - constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa - = dnnl::impl::cpu::x64::cpu_isa_t::isa_undef; + constexpr static size_t size = 8; // in bytes + constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa = dnnl::impl::cpu::x64::cpu_isa_t::isa_undef; }; -template<> +template <> struct reg_traits_by_size<16> { using type = Xbyak::Xmm; - constexpr static size_t size = 16; // in bytes - constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa - = dnnl::impl::cpu::x64::cpu_isa_t::sse41; + constexpr static size_t size = 16; // in bytes + constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa = dnnl::impl::cpu::x64::cpu_isa_t::sse41; }; -template<> +template <> struct reg_traits_by_size<32> { using type = Xbyak::Ymm; - constexpr static size_t size = 32; // in bytes - constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa - = dnnl::impl::cpu::x64::cpu_isa_t::avx2; + constexpr static size_t size = 32; // in bytes + constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa = dnnl::impl::cpu::x64::cpu_isa_t::avx2; }; -template<> +template <> struct reg_traits_by_size<64> { using type = Xbyak::Zmm; - constexpr static size_t size = 64; // in bytes - constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa - = dnnl::impl::cpu::x64::cpu_isa_t::avx512_core; + constexpr static size_t size = 64; // in bytes + constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa = dnnl::impl::cpu::x64::cpu_isa_t::avx512_core; }; -template +template struct reg_traits : public reg_traits_by_size {}; -template +template struct vec_min_size { - constexpr static size_t size = N <= 16 ? 16 : - N <= 32 ? 32 : - 64; + constexpr static size_t size = N <= 16 ? 16 : N <= 32 ? 
32 : 64; }; -template +template struct reg_traits : public reg_traits_by_size::size> {}; -template<> +template <> struct reg_traits { using type = Xbyak::Fpu; - constexpr static size_t size = 10; // in bytes - constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa - = dnnl::impl::cpu::x64::cpu_isa_t::isa_undef; + constexpr static size_t size = 10; // in bytes + constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa = dnnl::impl::cpu::x64::cpu_isa_t::isa_undef; }; -template<> +template <> struct reg_traits : public reg_traits {}; -template<> +template <> struct isa_traits { struct reg { using type = Xbyak::Xmm; - constexpr static size_t size = 4 * 4; // in bytes - constexpr static size_t length = 4; // in dwords + constexpr static size_t size = 4 * 4; // in bytes + constexpr static size_t length = 4; // in dwords }; }; -template<> +template <> struct isa_traits { struct reg { using type = Xbyak::Ymm; - constexpr static size_t size = 8 * 4; // in bytes - constexpr static size_t length = 8; // in dwords + constexpr static size_t size = 8 * 4; // in bytes + constexpr static size_t length = 8; // in dwords }; }; -template<> +template <> struct isa_traits { struct reg { using type = Xbyak::Zmm; @@ -134,39 +125,39 @@ struct isa_traits { }; }; -template +template class variable; -template +template class if_expression; -template +template class then_expression; -template +template using shared_reg = std::shared_ptr; -template -shared_reg make_shared(Reg & reg, jit_kernel & kernel); +template +shared_reg make_shared(Reg& reg, jit_kernel& kernel); -template +template class boolean_expression { public: using reg_type = const typename reg_traits::type; enum class type { - eq, // == - neq, // != - ls, // < - gt, // > - le, // <= - ge // >= + eq, // == + neq, // != + ls, // < + gt, // > + le, // <= + ge // >= }; - boolean_expression(jit_kernel & kernel, type t, const shared_reg & lhs, const shared_reg & rhs); - boolean_expression(jit_kernel & kernel, type t, const shared_reg & lhs, 
T rhs); + boolean_expression(jit_kernel& kernel, type t, const shared_reg& lhs, const shared_reg& rhs); + boolean_expression(jit_kernel& kernel, type t, const shared_reg& lhs, T rhs); private: - void cmp(const Xbyak::Label & exit) const; + void cmp(const Xbyak::Label& exit) const; - jit_kernel & _kernel; + jit_kernel& _kernel; type _type; shared_reg _lhs; shared_reg _rhs; @@ -176,33 +167,33 @@ class boolean_expression { friend class then_expression; }; -template +template class then_expression { public: - then_expression(if_expression & expr); + then_expression(if_expression& expr); - template - void _else(F && fn); + template + void _else(F&& fn); private: - if_expression & _if_expr; + if_expression& _if_expr; }; -template +template class if_expression { public: - if_expression(const boolean_expression & expr) - : _expr(expr) {} + if_expression(const boolean_expression& expr) : _expr(expr) {} ~if_expression() { try { if (!_is_exit_valid) _expr._kernel.assignL(_exit, _else); - } catch(...) {} + } catch (...) 
{ + } } - template - then_expression _then(F && fn) { + template + then_expression _then(F&& fn) { using namespace Xbyak; _expr.cmp(_else); @@ -214,7 +205,7 @@ class if_expression { } private: - const boolean_expression & _expr; + const boolean_expression& _expr; Xbyak::Label _exit; Xbyak::Label _else; bool _is_exit_valid = false; @@ -222,287 +213,291 @@ class if_expression { friend class then_expression; }; -typedef struct register_tag {} register_tag; -typedef struct memory_tag {} memory_tag; +typedef struct register_tag { +} register_tag; +typedef struct memory_tag { +} memory_tag; -template +template class variable_base; -template +template class variable_base { public: using reg_type = const typename reg_traits::type; - variable_base & operator = (const variable_base &) = delete; + variable_base& operator=(const variable_base&) = delete; - variable_base(const variable_base &); - variable_base(variable_base &&); + variable_base(const variable_base&); + variable_base(variable_base&&); - reg_type & reg() const { - return *_reg; + reg_type& reg() const { + return *_reg; } - const shared_reg & shreg() const { + const shared_reg& shreg() const { return _reg; } - operator reg_type &() const { + operator reg_type&() const { return reg(); } - operator Xbyak::RegExp () const { + operator Xbyak::RegExp() const { return reg(); } protected: - variable_base(jit_kernel & krnl, const shared_reg & reg); + variable_base(jit_kernel& krnl, const shared_reg& reg); ~variable_base() = default; - jit_kernel & _kernel; + jit_kernel& _kernel; shared_reg _reg; }; -template +template class variable_base { public: using reg_type = const typename reg_traits::type; - variable_base & operator = (const variable_base &) = delete; + variable_base& operator=(const variable_base&) = delete; - variable_base(const variable_base &); - variable_base(variable_base &&); + variable_base(const variable_base&); + variable_base(variable_base&&); - reg_type & reg() const { - return *_addr; + reg_type& reg() 
const { + return *_addr; } protected: - variable_base(jit_kernel & krnl, const shared_reg & addr); + variable_base(jit_kernel& krnl, const shared_reg& addr); ~variable_base() = default; - jit_kernel & _kernel; + jit_kernel& _kernel; shared_reg _addr; }; -template -class variable : public variable_base::value, T>::type, register_tag> { +template +class variable + : public variable_base::value, T>::type, register_tag> { public: using type = T; using base = variable_base; using reg_type = const typename base::reg_type; using arithmetic_type = typename std::conditional::value, size_t, T>::type; - variable(variable &&) = default; - variable(jit_kernel & krnl); - variable(jit_kernel & krnl, const shared_reg & reg); + variable(variable&&) = default; + variable(jit_kernel& krnl); + variable(jit_kernel& krnl, const shared_reg& reg); - typename std::conditional::value - && !std::is_pointer::type>::value, - variable::type, memory_tag>, void>::type - operator *() const { + typename std::conditional::value && + !std::is_pointer::type>::value, + variable::type, memory_tag>, + void>::type + operator*() const { return variable::type, memory_tag>(base::_kernel, base::shreg()); } - const variable & operator = (reg_type & rhs) const { + const variable& operator=(reg_type& rhs) const { base::_kernel.mov(base::reg(), rhs); return *this; } - template - const variable & operator = (U *rhs) const { + template + const variable& operator=(U* rhs) const { // interpret pointers as size_t base::_kernel.mov(base::reg(), reinterpret_cast(rhs)); return *this; } - const variable & operator = (arithmetic_type rhs) const { + const variable& operator=(arithmetic_type rhs) const { base::_kernel.mov(base::reg(), static_cast(rhs)); return *this; } - const variable & operator += (reg_type & rhs) const { + const variable& operator+=(reg_type& rhs) const { base::_kernel.add(base::reg(), rhs); return *this; } - variable operator + (reg_type & rhs) const { + variable operator+(reg_type& rhs) const { variable 
res(base::_kernel); res = base::reg(); res += rhs; return res; } - const variable & operator += (arithmetic_type rhs) const { + const variable& operator+=(arithmetic_type rhs) const { base::_kernel.add(base::reg(), rhs); return *this; } - variable operator + (arithmetic_type rhs) const { + variable operator+(arithmetic_type rhs) const { variable res(base::_kernel); res = base::reg(); res += rhs; return res; } - const variable & operator -= (reg_type & rhs) const { + const variable& operator-=(reg_type& rhs) const { base::_kernel.sub(base::reg(), rhs); return *this; } - variable operator - (reg_type & rhs) const { + variable operator-(reg_type& rhs) const { variable res(base::_kernel); res = base::reg(); res -= rhs; return res; } - const variable & operator -= (arithmetic_type rhs) const { + const variable& operator-=(arithmetic_type rhs) const { base::_kernel.sub(base::reg(), rhs); return *this; } - variable operator - (arithmetic_type rhs) const { + variable operator-(arithmetic_type rhs) const { variable res(base::_kernel); res = base::reg(); res -= rhs; return res; } - const variable & operator *= (reg_type & rhs) const { + const variable& operator*=(reg_type& rhs) const { base::_kernel.imul(base::reg(), rhs); return *this; } - variable operator * (reg_type & rhs) const { + variable operator*(reg_type& rhs) const { variable res(base::_kernel); res = base::reg(); res *= rhs; return res; } - const variable & operator *= (arithmetic_type rhs) const { + const variable& operator*=(arithmetic_type rhs) const { base::_kernel.imul(base::reg(), base::reg(), static_cast(rhs)); return *this; } - variable operator * (arithmetic_type rhs) const { + variable operator*(arithmetic_type rhs) const { variable res(base::_kernel); res = base::reg(); res *= rhs; return res; } - const variable & operator &= (reg_type & rhs) const { + const variable& operator&=(reg_type& rhs) const { base::_kernel.and_(base::reg(), rhs); return *this; } - variable operator & (reg_type & rhs) const { + 
variable operator&(reg_type& rhs) const { variable res(base::_kernel); res = base::reg(); res &= rhs; return res; } - const variable & operator &= (T rhs) const { + const variable& operator&=(T rhs) const { base::_kernel.and_(base::reg(), rhs); return *this; } - variable operator & (T rhs) const { + variable operator&(T rhs) const { variable res(base::_kernel); res = base::reg(); res &= rhs; return res; } - const variable & operator |= (reg_type & rhs) const { + const variable& operator|=(reg_type& rhs) const { base::_kernel.or_(base::reg(), rhs); return *this; } - variable operator | (reg_type & rhs) const { + variable operator|(reg_type& rhs) const { variable res(base::_kernel); res = base::reg(); res |= rhs; return res; } - const variable & operator |= (T rhs) const { + const variable& operator|=(T rhs) const { base::_kernel.or_(base::reg(), rhs); return *this; } - variable operator | (T rhs) const { + variable operator|(T rhs) const { variable res(base::_kernel); res = base::reg(); res |= rhs; return res; } - const variable & operator >>= (size_t rhs) const { + const variable& operator>>=(size_t rhs) const { base::_kernel.shr(base::reg(), rhs); return *this; } - variable operator >> (size_t rhs) const { + variable operator>>(size_t rhs) const { variable res(base::_kernel); res = base::reg(); res >>= rhs; return res; } - const variable & operator <<= (size_t rhs) const { + const variable& operator<<=(size_t rhs) const { base::_kernel.shl(base::reg(), rhs); return *this; } - variable operator << (size_t rhs) const { + variable operator<<(size_t rhs) const { variable res(base::_kernel); res = base::reg(); res <<= rhs; return res; } - boolean_expression operator == (const variable & rhs) const { + boolean_expression operator==(const variable& rhs) const { return boolean_expression(base::_kernel, boolean_expression::type::eq, base::shreg(), rhs.shreg()); } - boolean_expression operator == (T rhs) const { + boolean_expression operator==(T rhs) const { return 
boolean_expression(base::_kernel, boolean_expression::type::eq, base::shreg(), rhs); } - boolean_expression operator != (const variable & rhs) const { + boolean_expression operator!=(const variable& rhs) const { return boolean_expression(base::_kernel, boolean_expression::type::neq, base::shreg(), rhs.shreg()); } - boolean_expression operator != (T rhs) const { + boolean_expression operator!=(T rhs) const { return boolean_expression(base::_kernel, boolean_expression::type::neq, base::shreg(), rhs); } - boolean_expression operator < (const variable & rhs) const { + boolean_expression operator<(const variable& rhs) const { return boolean_expression(base::_kernel, boolean_expression::type::ls, base::shreg(), rhs.shreg()); } - boolean_expression operator < (T rhs) const { + boolean_expression operator<(T rhs) const { return boolean_expression(base::_kernel, boolean_expression::type::ls, base::shreg(), rhs); } - boolean_expression operator > (const variable & rhs) const { + boolean_expression operator>(const variable& rhs) const { return boolean_expression(base::_kernel, boolean_expression::type::gt, base::shreg(), rhs.shreg()); } - boolean_expression operator > (T rhs) const { + boolean_expression operator>(T rhs) const { return boolean_expression(base::_kernel, boolean_expression::type::gt, base::shreg(), rhs); } - boolean_expression operator <= (const variable & rhs) const { + boolean_expression operator<=(const variable& rhs) const { return boolean_expression(base::_kernel, boolean_expression::type::le, base::shreg(), rhs.shreg()); } - boolean_expression operator <= (T rhs) const { + boolean_expression operator<=(T rhs) const { return boolean_expression(base::_kernel, boolean_expression::type::le, base::shreg(), rhs); } - boolean_expression operator >= (const variable & rhs) const { + boolean_expression operator>=(const variable& rhs) const { return boolean_expression(base::_kernel, boolean_expression::type::ge, base::shreg(), rhs.shreg()); } - boolean_expression 
operator >= (T rhs) const { + boolean_expression operator>=(T rhs) const { return boolean_expression(base::_kernel, boolean_expression::type::ge, base::shreg(), rhs); } // TODO: add necessary operations }; -template +template class variable : public variable_base { public: using type = T; using base = variable_base; using reg_type = const typename base::reg_type; - variable(variable &&) = default; - variable(jit_kernel & krnl, const shared_reg & reg); + variable(variable&&) = default; + variable(jit_kernel& krnl, const shared_reg& reg); - const variable & operator = (const variable & rhs) const; + const variable& operator=(const variable& rhs) const; }; -template +template class variable : public variable_base { public: using type = T[N]; @@ -510,34 +505,34 @@ class variable : public variable_base { using reg_type = const typename base::reg_type; constexpr static size_t length = N; - variable(variable &&) = default; - variable(jit_kernel & krnl); - variable(jit_kernel & krnl, const shared_reg & reg); + variable(variable&&) = default; + variable(jit_kernel& krnl); + variable(jit_kernel& krnl, const shared_reg& reg); - const variable & operator = (reg_type & rhs) const { + const variable& operator=(reg_type& rhs) const { base::_kernel.uni_vmovups(base::reg(), rhs); return *this; } - const variable & operator = (const type & rhs) const { - const type & cref = base::_kernel.constant(rhs); + const variable& operator=(const type& rhs) const { + const type& cref = base::_kernel.constant(rhs); variable creg(base::_kernel); creg = &cref; base::_kernel.uni_vmovdqu(base::reg(), base::_kernel.ptr[creg]); return *this; } - const variable & blend(reg_type & rhs, uint16_t mask) const { + const variable& blend(reg_type& rhs, uint16_t mask) const { base::_kernel.uni_vblendps(base::reg(), rhs, mask); return *this; } - const variable & permute(const std::array & order) const { + const variable& permute(const std::array& order) const { base::_kernel.uni_vpermps(base::reg(), 
order.data(), base::reg()); return *this; } - const variable & permute(const uint8_t * order) const { + const variable& permute(const uint8_t* order) const { base::_kernel.uni_vpermps(base::reg(), order, base::reg()); return *this; } @@ -546,139 +541,132 @@ class variable : public variable_base { }; class stack_frame { - stack_frame(const stack_frame &) = delete; - stack_frame & operator = (const stack_frame &) = delete; + stack_frame(const stack_frame&) = delete; + stack_frame& operator=(const stack_frame&) = delete; public: - stack_frame(jit_kernel & kernel, size_t size, uint32_t alignment = 1); - stack_frame(stack_frame && rhs); + stack_frame(jit_kernel& kernel, size_t size, uint32_t alignment = 1); + stack_frame(stack_frame&& rhs); ~stack_frame(); - const Xbyak::Reg64 & pointer() const; + const Xbyak::Reg64& pointer() const; void clear() const; private: - jit_kernel & _kernel; + jit_kernel& _kernel; size_t _size; uint32_t _alignment; }; -template +template ov::element::Type type2precision(); dnnl::impl::cpu::x64::cpu_isa_t get_current_isa(); class consts_table { - consts_table(const consts_table &) = delete; - consts_table & operator = (const consts_table &) = delete; + consts_table(const consts_table&) = delete; + consts_table& operator=(const consts_table&) = delete; public: consts_table() = default; - const void * store(const void *data, size_t size); + const void* store(const void* data, size_t size); private: static constexpr const size_t chunk_size = 512; using chunk = std::array; std::list _chunks; - size_t _size {}; + size_t _size{}; }; -} // namespace internal +} // namespace internal struct jit_kernel : public dnnl::impl::cpu::x64::jit_generator { using reg_indices = std::vector; - template + template using reg_traits = internal::reg_traits; - template + template using reg_traits_by_size = internal::reg_traits_by_size; - template + template using isa_traits = internal::isa_traits; using stack_frame = internal::stack_frame; using register_tag = 
internal::register_tag; using memory_tag = internal::memory_tag; - template + template using variable = internal::variable; - template + template using if_expression = internal::if_expression; - template + template using boolean_expression = internal::boolean_expression; - template + template Xbyak::Address argPtr(U T::*member) const { auto memPtr = &(reinterpret_cast(0)->*member); - const size_t offs = reinterpret_cast(memPtr) - reinterpret_cast(0); + const size_t offs = reinterpret_cast(memPtr) - reinterpret_cast(0); return address_frame(sizeof(U))[param1 + offs]; } - template + template variable arg(U T::*member) { using traits = internal::reg_traits; using reg_type = typename traits::type; - const auto & res = reserve(); + const auto& res = reserve(); if (sizeof(T) < traits::size) movzx(res, argPtr(member)); else mov(res, argPtr(member)); - return { *this, internal::make_shared(res, *this) }; + return {*this, internal::make_shared(res, *this)}; } - template + template variable arg(U T::*member) { using traits = internal::reg_traits; using reg_type = typename traits::type; - const auto & res = reserve(); + const auto& res = reserve(); if (sizeof(T) < traits::size) movzx(res, argPtr(member)); else mov(res, argPtr(member)); - return { *this, internal::make_shared(res, *this) }; - } - - jit_kernel(const char *name); - - template - const RegType & reserve(); - - template - void free(const RegType & reg); - - template - void copy(const Xbyak::Reg64& dst, - const Xbyak::Reg64& src, - const Xbyak::Reg64& size); - template - void copy(const Xbyak::Address& dst, - const Xbyak::Reg64& src, - const Xbyak::Reg64& size); - - template - void load(const variable & dst, const variable & src, size_t length = N); - template - void load(const variable & dst, const variable & src, const variable & length); - template - void store(const variable & dst, const variable & src, size_t length = N); - template - void store(const variable & dst, const variable & src, const variable & 
length); - - template - void foreach(const B & begin, - const E & end, - std::function&)> && fn, - const S & step = 1); - - template + return {*this, internal::make_shared(res, *this)}; + } + + jit_kernel(const char* name); + + template + const RegType& reserve(); + + template + void free(const RegType& reg); + + template + void copy(const Xbyak::Reg64& dst, const Xbyak::Reg64& src, const Xbyak::Reg64& size); + template + void copy(const Xbyak::Address& dst, const Xbyak::Reg64& src, const Xbyak::Reg64& size); + + template + void load(const variable& dst, const variable& src, size_t length = N); + template + void load(const variable& dst, const variable& src, const variable& length); + template + void store(const variable& dst, const variable& src, size_t length = N); + template + void store(const variable& dst, const variable& src, const variable& length); + + template + void foreach (const B& begin, const E& end, std::function&)> && fn, const S& step = 1); + + template variable var(); - template - variable var(const T & val); + template + variable var(const T& val); - template - const T & constant(const T & c); - template - const T * constant(const T * c, size_t size); + template + const T& constant(const T& c); + template + const T* constant(const T* c, size_t size); stack_frame stack(size_t size, uint32_t alignment = 1); - template - if_expression _if(const boolean_expression & expr) const; + template + if_expression _if(const boolean_expression& expr) const; void uni_vpermps(const Xbyak::Xmm& x1, const uint8_t mask[4], const Xbyak::Operand& op); void uni_vpermps(const Xbyak::Ymm& y1, const uint8_t mask[8], const Xbyak::Operand& op); @@ -689,9 +677,9 @@ struct jit_kernel : public dnnl::impl::cpu::x64::jit_generator { void postamble(); - const Xbyak::AddressFrame & address_frame(size_t size) const; - const reg_indices & free_x64regs() const; - const reg_indices & free_rmmregs() const; + const Xbyak::AddressFrame& address_frame(size_t size) const; + const 
reg_indices& free_x64regs() const; + const reg_indices& free_rmmregs() const; private: reg_indices _free_x64regs; @@ -703,44 +691,40 @@ struct jit_kernel : public dnnl::impl::cpu::x64::jit_generator { template <> const Xbyak::Reg64& jit_kernel::reserve(); -template -void jit_kernel::copy(const Xbyak::Reg64& dst, - const Xbyak::Reg64& src, - const Xbyak::Reg64& size) { - const auto & addr_frame = address_frame(sizeof(T)); +template +void jit_kernel::copy(const Xbyak::Reg64& dst, const Xbyak::Reg64& src, const Xbyak::Reg64& size) { + const auto& addr_frame = address_frame(sizeof(T)); auto p = reserve::type>(); - foreach(0, size, [&](const Xbyak::Reg64& idx) { + foreach (0, size, [&](const Xbyak::Reg64& idx) { mov(p, addr_frame[src + idx * sizeof(T)]); mov(addr_frame[dst + idx * sizeof(T)], p); - }); + }) + ; free(p); } -template -void jit_kernel::copy(const Xbyak::Address& dst, - const Xbyak::Reg64& src, - const Xbyak::Reg64& size) { - const auto & addr_frame = address_frame(sizeof(T)); +template +void jit_kernel::copy(const Xbyak::Address& dst, const Xbyak::Reg64& src, const Xbyak::Reg64& size) { + const auto& addr_frame = address_frame(sizeof(T)); auto p = reserve::type>(); auto d = reserve(); lea(d, dst); - foreach(0, size, [&](const Xbyak::Reg64& idx) { + foreach (0, size, [&](const Xbyak::Reg64& idx) { mov(p, addr_frame[src + idx * sizeof(T)]); mov(addr_frame[d + idx * sizeof(T)], p); - }); + }) + ; free(d); free(p); } -template -void jit_kernel::load(const variable & dst, const variable & src, size_t length) { +template +void jit_kernel::load(const variable& dst, const variable& src, size_t length) { static_assert(std::is_same::reg_type, const Xbyak::Reg64>::value, - "Source register must be Reg64"); + "Source register must be Reg64"); - using src_type = typename std::remove_cv< - typename std::remove_pointer::type>::type; - using dst_type = typename std::remove_cv< - typename std::remove_pointer::type>::type; + using src_type = typename 
std::remove_cv::type>::type; + using dst_type = typename std::remove_cv::type>::type; const std::vector pool_vec_idxs(_free_rmmregs.begin(), _free_rmmregs.end()); const std::vector pool_gpr_idxs(_free_x64regs.begin(), _free_x64regs.end()); @@ -752,17 +736,15 @@ void jit_kernel::load(const variable & dst, const variable & src, if (!_emitters[key]) { _emitters[key].reset(new jit_load_emitter(this, internal::get_current_isa(), src_prc, dst_prc, length)); } - _emitters[key]->emit_code( - { static_cast(static_cast(src).getIdx()) }, - { static_cast(static_cast(dst).getIdx()) }, - pool_vec_idxs, - pool_gpr_idxs); + _emitters[key]->emit_code({static_cast(static_cast(src).getIdx())}, + {static_cast(static_cast(dst).getIdx())}, + pool_vec_idxs, + pool_gpr_idxs); } -template -void jit_kernel::load(const variable & dst, const variable & src, const variable & length) { - using src_type = typename std::remove_cv< - typename std::remove_pointer::type>::type; +template +void jit_kernel::load(const variable& dst, const variable& src, const variable& length) { + using src_type = typename std::remove_cv::type>::type; auto s = stack(N * sizeof(src_type)); s.clear(); @@ -775,15 +757,13 @@ void jit_kernel::load(const variable & dst, const variable & src, load(dst, tmp); } -template -void jit_kernel::store(const variable & dst, const variable & src, size_t length) { +template +void jit_kernel::store(const variable& dst, const variable& src, size_t length) { static_assert(std::is_same::reg_type, const Xbyak::Reg64>::value, - "Destination register must be Reg64"); + "Destination register must be Reg64"); - using src_type = typename std::remove_cv< - typename std::remove_pointer::type>::type; - using dst_type = typename std::remove_cv< - typename std::remove_pointer::type>::type; + using src_type = typename std::remove_cv::type>::type; + using dst_type = typename std::remove_cv::type>::type; const std::vector pool_vec_idxs(_free_rmmregs.begin(), _free_rmmregs.end()); const std::vector 
pool_gpr_idxs(_free_x64regs.begin(), _free_x64regs.end()); @@ -795,17 +775,15 @@ void jit_kernel::store(const variable & dst, const variable & src if (!_emitters[key]) { _emitters[key].reset(new jit_store_emitter(this, internal::get_current_isa(), src_prc, dst_prc, length)); } - _emitters[key]->emit_code( - { static_cast(static_cast(src).getIdx()) }, - { static_cast(static_cast(dst).getIdx()) }, - pool_vec_idxs, - pool_gpr_idxs); + _emitters[key]->emit_code({static_cast(static_cast(src).getIdx())}, + {static_cast(static_cast(dst).getIdx())}, + pool_vec_idxs, + pool_gpr_idxs); } -template -void jit_kernel::store(const variable & dst, const variable & src, const variable & length) { - using dst_type = typename std::remove_cv< - typename std::remove_pointer::type>::type; +template +void jit_kernel::store(const variable& dst, const variable& src, const variable& length) { + using dst_type = typename std::remove_cv::type>::type; auto s = stack(N * sizeof(dst_type)); @@ -817,11 +795,11 @@ void jit_kernel::store(const variable & dst, const variable & src copy(dst, tmp, length); } -template -void jit_kernel::foreach(const B & begin, - const E & end, - std::function&)> && fn, - const S & step) { +template +void jit_kernel::foreach (const B& begin, + const E& end, + std::function&)> && fn, + const S& step) { using namespace Xbyak; Label loop, exit; @@ -841,36 +819,36 @@ void jit_kernel::foreach(const B & begin, L(exit); } -template +template jit_kernel::variable jit_kernel::var() { using reg_type = typename reg_traits::type; - const auto & reg = reserve(); + const auto& reg = reserve(); return variable(*this, internal::make_shared(reg, *this)); } -template -jit_kernel::variable jit_kernel::var(const T & val) { +template +jit_kernel::variable jit_kernel::var(const T& val) { using reg_type = typename reg_traits::type; - const auto & reg = reserve(); + const auto& reg = reserve(); variable res(*this, internal::make_shared(reg, *this)); res = val; return res; } -template -const 
T & jit_kernel::constant(const T & c) { +template +const T& jit_kernel::constant(const T& c) { auto res = _consts.store(&c, sizeof c); return *reinterpret_cast(res); } -template -const T * jit_kernel::constant(const T * c, size_t size) { +template +const T* jit_kernel::constant(const T* c, size_t size) { auto res = _consts.store(c, size * sizeof(T)); return reinterpret_cast(res); } -template -jit_kernel::if_expression jit_kernel::_if(const boolean_expression & expr) const { +template +jit_kernel::if_expression jit_kernel::_if(const boolean_expression& expr) const { return if_expression(expr); } @@ -879,12 +857,13 @@ namespace internal { // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // shared_reg -template -shared_reg make_shared(Reg & reg, jit_kernel & kernel) { - std::shared_ptr ptr(®, [&kernel](Reg *preg) { +template +shared_reg make_shared(Reg& reg, jit_kernel& kernel) { + std::shared_ptr ptr(®, [&kernel](Reg* preg) { try { kernel.free(*preg); - } catch(...) {} + } catch (...) 
{ + } }); return ptr; } @@ -892,68 +871,68 @@ shared_reg make_shared(Reg & reg, jit_kernel & kernel) { // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // boolean_expression -template -boolean_expression::boolean_expression(jit_kernel & kernel, type t, const shared_reg & lhs, const shared_reg & rhs) - : _kernel(kernel) - , _type(t) - , _lhs(lhs) - , _rhs(rhs) - , _rvalue {} { -} - -template -boolean_expression::boolean_expression(jit_kernel & kernel, type t, const shared_reg & lhs, T rhs) - : _kernel(kernel) - , _type(t) - , _lhs(lhs) - , _rvalue(rhs) { -} - -template -void boolean_expression::cmp(const Xbyak::Label & exit) const { +template +boolean_expression::boolean_expression(jit_kernel& kernel, + type t, + const shared_reg& lhs, + const shared_reg& rhs) + : _kernel(kernel), + _type(t), + _lhs(lhs), + _rhs(rhs), + _rvalue{} {} + +template +boolean_expression::boolean_expression(jit_kernel& kernel, type t, const shared_reg& lhs, T rhs) + : _kernel(kernel), + _type(t), + _lhs(lhs), + _rvalue(rhs) {} + +template +void boolean_expression::cmp(const Xbyak::Label& exit) const { if (_rhs) _kernel.cmp(*_lhs, *_rhs); else _kernel.cmp(*_lhs, _rvalue); switch (_type) { - case type::eq: { - _kernel.jne(exit, Xbyak::CodeGenerator::T_NEAR); - break; - } - case type::neq: { - _kernel.je(exit, Xbyak::CodeGenerator::T_NEAR); - break; - } - case type::ls: { - _kernel.jge(exit, Xbyak::CodeGenerator::T_NEAR); - break; - } - case type::gt: { - _kernel.jle(exit, Xbyak::CodeGenerator::T_NEAR); - break; - } - case type::le: { - _kernel.jg(exit, Xbyak::CodeGenerator::T_NEAR); - break; - } - case type::ge: { - _kernel.jl(exit, Xbyak::CodeGenerator::T_NEAR); - break; - } + case type::eq: { + _kernel.jne(exit, Xbyak::CodeGenerator::T_NEAR); + break; + } + case type::neq: { + _kernel.je(exit, Xbyak::CodeGenerator::T_NEAR); + break; + } + case type::ls: { + _kernel.jge(exit, Xbyak::CodeGenerator::T_NEAR); + break; + } + case type::gt: { + _kernel.jle(exit, 
Xbyak::CodeGenerator::T_NEAR); + break; + } + case type::le: { + _kernel.jg(exit, Xbyak::CodeGenerator::T_NEAR); + break; + } + case type::ge: { + _kernel.jl(exit, Xbyak::CodeGenerator::T_NEAR); + break; + } } } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // then_expression -template -then_expression::then_expression(if_expression & expr) - : _if_expr(expr) {} +template +then_expression::then_expression(if_expression& expr) : _if_expr(expr) {} -template -template -void then_expression::_else(F && fn) { +template +template +void then_expression::_else(F&& fn) { fn(); _if_expr._expr._kernel.L(_if_expr._exit); _if_expr._is_exit_valid = true; @@ -962,75 +941,57 @@ void then_expression::_else(F && fn) { // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // variable -template -variable_base::variable_base(jit_kernel & krnl, const shared_reg & reg) - : _kernel(krnl) - , _reg(reg) { -} +template +variable_base::variable_base(jit_kernel& krnl, const shared_reg& reg) + : _kernel(krnl), + _reg(reg) {} -template -variable_base::variable_base(const variable_base & rhs) - : _kernel(rhs._kernel) - , _reg(rhs._reg) { -} +template +variable_base::variable_base(const variable_base& rhs) : _kernel(rhs._kernel), + _reg(rhs._reg) {} -template -variable_base::variable_base(variable_base && rhs) - : _kernel(rhs._kernel) - , _reg(std::move(rhs._reg)) { -} +template +variable_base::variable_base(variable_base&& rhs) : _kernel(rhs._kernel), + _reg(std::move(rhs._reg)) {} -template -variable_base::variable_base(jit_kernel & krnl, const shared_reg & addr) - : _kernel(krnl) - , _addr(addr) { -} +template +variable_base::variable_base(jit_kernel& krnl, const shared_reg& addr) + : _kernel(krnl), + _addr(addr) {} -template -variable_base::variable_base(const variable_base & rhs) - : _kernel(rhs._kernel) - , _addr(rhs._addr) { -} +template +variable_base::variable_base(const variable_base& rhs) : _kernel(rhs._kernel), + _addr(rhs._addr) {} -template 
-variable_base::variable_base(variable_base && rhs) - : _kernel(rhs._kernel) - , _addr(std::move(rhs._addr)) { -} +template +variable_base::variable_base(variable_base&& rhs) : _kernel(rhs._kernel), + _addr(std::move(rhs._addr)) {} -template -variable::variable(jit_kernel & krnl) - : base(krnl, make_shared(krnl.reserve::type>(), krnl)) { -} +template +variable::variable(jit_kernel& krnl) + : base(krnl, make_shared(krnl.reserve::type>(), krnl)) {} -template -variable::variable(jit_kernel & krnl, const shared_reg & reg) - : base(krnl, reg) { -} +template +variable::variable(jit_kernel& krnl, const shared_reg& reg) : base(krnl, reg) {} -template -variable::variable(jit_kernel & krnl, const shared_reg & reg) - : base(krnl, reg) { -} +template +variable::variable(jit_kernel& krnl, const shared_reg& reg) : base(krnl, reg) {} -template -const variable & variable::operator = (const variable & rhs) const { - const auto & addr_frame = base::_kernel.address_frame(sizeof(T)); +template +const variable& variable::operator=(const variable& rhs) const { + const auto& addr_frame = base::_kernel.address_frame(sizeof(T)); base::_kernel.mov(addr_frame[base::reg()], rhs); return *this; } -template -variable::variable(jit_kernel & krnl) - : base(krnl, make_shared(krnl.reserve::type>(), krnl)) { -} +template +variable::variable(jit_kernel& krnl) + : base(krnl, make_shared(krnl.reserve::type>(), krnl)) {} -template -variable::variable(jit_kernel & krnl, const shared_reg & reg) - : base(krnl, reg) { -} +template +variable::variable(jit_kernel& krnl, const shared_reg& reg) : base(krnl, reg) {} -} // namespace internal +} // namespace internal -} // namespace intel_cpu -} // namespace ov +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/jit_kernel_base.cpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/jit_kernel_base.cpp index 8fd3a966e13887..ffc0286431b279 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/jit_kernel_base.cpp +++ 
b/src/plugins/intel_cpu/src/nodes/kernels/x64/jit_kernel_base.cpp @@ -10,14 +10,11 @@ namespace ov { namespace intel_cpu { namespace kernel { -JitKernelBase::JitKernelBase(const char* name, x64::cpu_isa_t isa) - : x64::jit_generator(name, isa), m_isa(isa) { +JitKernelBase::JitKernelBase(const char* name, x64::cpu_isa_t isa) : x64::jit_generator(name, isa), m_isa(isa) { vlen = x64::isa_max_vlen(isa); } -void JitKernelBase::uni_vfmsub132ps(const Xbyak::Xmm& v_dst, - const Xbyak::Xmm& v_src, - const Xbyak::Operand& op) { +void JitKernelBase::uni_vfmsub132ps(const Xbyak::Xmm& v_dst, const Xbyak::Xmm& v_src, const Xbyak::Operand& op) { if (isValidIsa(x64::avx2)) { vfmsub132ps(v_dst, v_src, op); } else if (isValidIsa(x64::avx)) { @@ -31,9 +28,7 @@ void JitKernelBase::uni_vfmsub132ps(const Xbyak::Xmm& v_dst, } } -void JitKernelBase::uni_vfnmadd132ps(const Xbyak::Xmm& v_dst, - const Xbyak::Xmm& v_src, - const Xbyak::Operand& op) { +void JitKernelBase::uni_vfnmadd132ps(const Xbyak::Xmm& v_dst, const Xbyak::Xmm& v_src, const Xbyak::Operand& op) { if (isValidIsa(x64::avx2)) { vfnmadd132ps(v_dst, v_src, op); } else if (isValidIsa(x64::avx)) { @@ -48,9 +43,7 @@ void JitKernelBase::uni_vfnmadd132ps(const Xbyak::Xmm& v_dst, } } -void JitKernelBase::uni_vfmsub231ps(const Xbyak::Xmm& v_dst, - const Xbyak::Xmm& v_src, - const Xbyak::Operand& op) { +void JitKernelBase::uni_vfmsub231ps(const Xbyak::Xmm& v_dst, const Xbyak::Xmm& v_src, const Xbyak::Operand& op) { if (isValidIsa(x64::avx2)) { vfmsub231ps(v_dst, v_src, op); } else if (isValidIsa(x64::avx)) { @@ -65,9 +58,7 @@ void JitKernelBase::uni_vfmsub231ps(const Xbyak::Xmm& v_dst, } } -void JitKernelBase::uni_vpaddd(const Xbyak::Ymm& v_dst, - const Xbyak::Ymm& v_src, - const Xbyak::Operand& op) { +void JitKernelBase::uni_vpaddd(const Xbyak::Ymm& v_dst, const Xbyak::Ymm& v_src, const Xbyak::Operand& op) { if (isValidIsa(x64::avx2)) { vpaddd(v_dst, v_src, op); } else if (isValidIsa(x64::avx)) { @@ -99,9 +90,7 @@ void 
JitKernelBase::uni_vpaddd(const Xbyak::Ymm& v_dst, } } -void JitKernelBase::uni_vpaddq(const Xbyak::Xmm& v_dst, - const Xbyak::Xmm& v_src, - const Xbyak::Operand& op) { +void JitKernelBase::uni_vpaddq(const Xbyak::Xmm& v_dst, const Xbyak::Xmm& v_src, const Xbyak::Operand& op) { if (isValidIsa(x64::avx2)) { vpaddq(v_dst, v_src, op); } else { @@ -112,9 +101,7 @@ void JitKernelBase::uni_vpaddq(const Xbyak::Xmm& v_dst, } } -void JitKernelBase::uni_vpsubd(const Xbyak::Ymm& v_dst, - const Xbyak::Ymm& v_src, - const Xbyak::Operand& op) { +void JitKernelBase::uni_vpsubd(const Xbyak::Ymm& v_dst, const Xbyak::Ymm& v_src, const Xbyak::Operand& op) { if (isValidIsa(x64::avx2)) { vpsubd(v_dst, v_src, op); } else if (isValidIsa(x64::avx)) { @@ -146,9 +133,7 @@ void JitKernelBase::uni_vpsubd(const Xbyak::Ymm& v_dst, } } -void JitKernelBase::uni_vsubpd(const Xbyak::Xmm& v_dst, - const Xbyak::Xmm& v_src, - const Xbyak::Operand& op) { +void JitKernelBase::uni_vsubpd(const Xbyak::Xmm& v_dst, const Xbyak::Xmm& v_src, const Xbyak::Operand& op) { if (isValidIsa(x64::avx)) { vsubpd(v_dst, v_src, op); } else { @@ -159,9 +144,7 @@ void JitKernelBase::uni_vsubpd(const Xbyak::Xmm& v_dst, } } -void JitKernelBase::uni_vmulpd(const Xbyak::Xmm& v_dst, - const Xbyak::Xmm& v_src, - const Xbyak::Operand& op) { +void JitKernelBase::uni_vmulpd(const Xbyak::Xmm& v_dst, const Xbyak::Xmm& v_src, const Xbyak::Operand& op) { if (isValidIsa(x64::avx)) { vmulpd(v_dst, v_src, op); } else { @@ -172,9 +155,7 @@ void JitKernelBase::uni_vmulpd(const Xbyak::Xmm& v_dst, } } -void JitKernelBase::uni_vpmuludq(const Xbyak::Xmm& v_dst, - const Xbyak::Xmm& v_src, - const Xbyak::Operand& op) { +void JitKernelBase::uni_vpmuludq(const Xbyak::Xmm& v_dst, const Xbyak::Xmm& v_src, const Xbyak::Operand& op) { if (isValidIsa(x64::avx2)) { vpmuludq(v_dst, v_src, op); } else { @@ -185,9 +166,7 @@ void JitKernelBase::uni_vpmuludq(const Xbyak::Xmm& v_dst, } } -void JitKernelBase::uni_vdivps(const Xbyak::Xmm& v_dst, - const 
Xbyak::Operand& op1, - const Xbyak::Operand& op2) { +void JitKernelBase::uni_vdivps(const Xbyak::Xmm& v_dst, const Xbyak::Operand& op1, const Xbyak::Operand& op2) { if (isValidIsa(x64::avx)) { vdivps(v_dst, op1, op2); } else { @@ -198,9 +177,7 @@ void JitKernelBase::uni_vdivps(const Xbyak::Xmm& v_dst, } } -void JitKernelBase::uni_vdivpd(const Xbyak::Xmm& v_dst, - const Xbyak::Xmm& v_src, - const Xbyak::Operand& op) { +void JitKernelBase::uni_vdivpd(const Xbyak::Xmm& v_dst, const Xbyak::Xmm& v_src, const Xbyak::Operand& op) { if (isValidIsa(x64::avx)) { vdivpd(v_dst, v_src, op); } else { @@ -211,9 +188,7 @@ void JitKernelBase::uni_vdivpd(const Xbyak::Xmm& v_dst, } } -void JitKernelBase::uni_vandps(const Xbyak::Xmm& v_dst, - const Xbyak::Xmm& vSrs, - const Xbyak::Operand &op) { +void JitKernelBase::uni_vandps(const Xbyak::Xmm& v_dst, const Xbyak::Xmm& vSrs, const Xbyak::Operand& op) { if (isValidIsa(x64::avx)) { vandps(v_dst, vSrs, op); } else { @@ -224,9 +199,7 @@ void JitKernelBase::uni_vandps(const Xbyak::Xmm& v_dst, } } -void JitKernelBase::uni_vandnps(const Xbyak::Xmm& v_dst, - const Xbyak::Xmm& vSrs, - const Xbyak::Operand &op) { +void JitKernelBase::uni_vandnps(const Xbyak::Xmm& v_dst, const Xbyak::Xmm& vSrs, const Xbyak::Operand& op) { if (isValidIsa(x64::avx)) { vandnps(v_dst, vSrs, op); } else { @@ -237,9 +210,9 @@ void JitKernelBase::uni_vandnps(const Xbyak::Xmm& v_dst, } } -void JitKernelBase::gatherdd(const Xbyak::Xmm& v_dst, - const Xbyak::Reg64& rSrcPtr, - const Xbyak::Xmm& vSrcShift, +void JitKernelBase::gatherdd(const Xbyak::Xmm& v_dst, + const Xbyak::Reg64& rSrcPtr, + const Xbyak::Xmm& vSrcShift, const Xbyak::Opmask& kReadMask, const bool useMask, const bool zeroFill) { @@ -254,17 +227,18 @@ void JitKernelBase::gatherdd(const Xbyak::Xmm& v_dst, vpgatherdd(v_dst | kReadMask, ptr[rSrcPtr + vSrcShift]); } -void JitKernelBase::gatherdd(const Xbyak::Xmm& v_dst, +void JitKernelBase::gatherdd(const Xbyak::Xmm& v_dst, const Xbyak::Reg64& rSrcPtr, - const 
Xbyak::Xmm& vSrcShift, - const Xbyak::Xmm& vReadMask, + const Xbyak::Xmm& vSrcShift, + const Xbyak::Xmm& vReadMask, const bool useMask, const bool zeroFill) { - if (v_dst.getIdx() == vSrcShift.getIdx() || v_dst.getIdx() == vReadMask.getIdx() || vSrcShift.getIdx() == vReadMask.getIdx()) { + if (v_dst.getIdx() == vSrcShift.getIdx() || v_dst.getIdx() == vReadMask.getIdx() || + vSrcShift.getIdx() == vReadMask.getIdx()) { OPENVINO_THROW("Any pair of the index, mask, or destination registers cannot be the same."); } if (zeroFill) - pxor(v_dst, v_dst); // Don't use vpxor. It zeros the rest of the YMM register. + pxor(v_dst, v_dst); // Don't use vpxor. It zeros the rest of the YMM register. if (isValidIsa(x64::avx2)) { if (!useMask) @@ -280,7 +254,7 @@ void JitKernelBase::gatherdd(const Xbyak::Xmm& v_dst, Xbyak::Label lLoopNext; if (useMask) { uni_vpextrd(r32Aux, vReadMask, i); - cmp(r32Aux, 0); // TODO: check significant bit + cmp(r32Aux, 0); // TODO: check significant bit je(lLoopNext, T_NEAR); } uni_vpextrd(r32Aux, vSrcShift, i); @@ -292,13 +266,14 @@ void JitKernelBase::gatherdd(const Xbyak::Xmm& v_dst, } } -void JitKernelBase::gatherdd(const Xbyak::Ymm& v_dst, +void JitKernelBase::gatherdd(const Xbyak::Ymm& v_dst, const Xbyak::Reg64& rSrcPtr, - const Xbyak::Ymm& vSrcShift, - const Xbyak::Ymm& vReadMask, + const Xbyak::Ymm& vSrcShift, + const Xbyak::Ymm& vReadMask, const bool useMask, const bool zeroFill) { - if (v_dst.getIdx() == vSrcShift.getIdx() || v_dst.getIdx() == vReadMask.getIdx() || vSrcShift.getIdx() == vReadMask.getIdx()) { + if (v_dst.getIdx() == vSrcShift.getIdx() || v_dst.getIdx() == vReadMask.getIdx() || + vSrcShift.getIdx() == vReadMask.getIdx()) { OPENVINO_THROW("Any pair of the index, mask, or destination registers cannot be the same."); } if (isValidIsa(x64::avx2)) { @@ -309,8 +284,7 @@ void JitKernelBase::gatherdd(const Xbyak::Ymm& v_dst, vpgatherdd(v_dst, ptr[rSrcPtr + vSrcShift], vReadMask); } else { - Xbyak::Xmm xmmDst = 
Xbyak::Xmm(v_dst.getIdx()), - xmmSrcShft = Xbyak::Xmm(vSrcShift.getIdx()), + Xbyak::Xmm xmmDst = Xbyak::Xmm(v_dst.getIdx()), xmmSrcShft = Xbyak::Xmm(vSrcShift.getIdx()), xmmReadMask = Xbyak::Xmm(vReadMask.getIdx()); for (uint8_t i = 0; i < 2; i++) { gatherdd(xmmDst, rSrcPtr, xmmSrcShft, xmmReadMask, useMask, zeroFill); @@ -323,7 +297,7 @@ void JitKernelBase::gatherdd(const Xbyak::Ymm& v_dst, } } -void JitKernelBase::uni_vpbroadcastq(const Xbyak::Xmm &x, const Xbyak::Operand &op) { +void JitKernelBase::uni_vpbroadcastq(const Xbyak::Xmm& x, const Xbyak::Operand& op) { if (isValidIsa(x64::avx2)) { vpbroadcastq(x, op); } else { @@ -332,7 +306,7 @@ void JitKernelBase::uni_vpbroadcastq(const Xbyak::Xmm &x, const Xbyak::Operand & } } -void JitKernelBase::uni_vpbroadcastd(const Xbyak::Xmm &x, const Xbyak::Operand &op) { +void JitKernelBase::uni_vpbroadcastd(const Xbyak::Xmm& x, const Xbyak::Operand& op) { if (isValidIsa(x64::avx2)) { vpbroadcastd(x, op); } else if (isValidIsa(x64::avx)) { @@ -348,7 +322,7 @@ void JitKernelBase::uni_vpbroadcastd(const Xbyak::Xmm &x, const Xbyak::Operand & } } -void JitKernelBase::uni_vpbroadcastd(const Xbyak::Ymm &x, const Xbyak::Operand &op) { +void JitKernelBase::uni_vpbroadcastd(const Xbyak::Ymm& x, const Xbyak::Operand& op) { if (isValidIsa(x64::avx2)) { vpbroadcastd(x, op); } else { @@ -375,8 +349,7 @@ void JitKernelBase::uni_vroundpd(const Xbyak::Xmm& v_dst, const Xbyak::Operand& } } -void JitKernelBase::uni_vcvtdq2pd(const Xbyak::Xmm& v_dst, - const Xbyak::Operand& op) { +void JitKernelBase::uni_vcvtdq2pd(const Xbyak::Xmm& v_dst, const Xbyak::Operand& op) { if (isValidIsa(x64::avx)) { vcvtdq2pd(v_dst, op); } else { @@ -384,8 +357,7 @@ void JitKernelBase::uni_vcvtdq2pd(const Xbyak::Xmm& v_dst, } } -void JitKernelBase::uni_vcvtpd2dq(const Xbyak::Xmm& v_dst, - const Xbyak::Operand& op) { +void JitKernelBase::uni_vcvtpd2dq(const Xbyak::Xmm& v_dst, const Xbyak::Operand& op) { if (isValidIsa(x64::avx)) { vcvtpd2dq(v_dst, op); } else { @@ 
-393,8 +365,7 @@ void JitKernelBase::uni_vcvtpd2dq(const Xbyak::Xmm& v_dst, } } -void JitKernelBase::uni_vpmovzxdq(const Xbyak::Xmm& v_dst, - const Xbyak::Operand& op) { +void JitKernelBase::uni_vpmovzxdq(const Xbyak::Xmm& v_dst, const Xbyak::Operand& op) { if (isValidIsa(x64::avx2)) { vpmovzxdq(v_dst, op); } else { @@ -416,8 +387,7 @@ void JitKernelBase::uni_vshufpd(const Xbyak::Xmm& v_dst, } } -void JitKernelBase::fillRestWorkMask(const Xbyak::Opmask& dstMask, - const Xbyak::Reg64& rWorkRest) { +void JitKernelBase::fillRestWorkMask(const Xbyak::Opmask& dstMask, const Xbyak::Reg64& rWorkRest) { auto rOnes = getReg64(); mov(rOnes, 0xFFFFFFFFFFFFFFFF); @@ -493,11 +463,11 @@ void JitKernelBase::fillRestWorkMask(const Xbyak::Ymm& ymmDstMask, L(lEnd); } -void JitKernelBase::load(const Xbyak::Xmm& v_dst, +void JitKernelBase::load(const Xbyak::Xmm& v_dst, const Xbyak::Address& srcAddr, - const Xbyak::Reg64& rLoadNum, - const size_t typeSize, - const bool zeroFilling) { + const Xbyak::Reg64& rLoadNum, + const size_t typeSize, + const bool zeroFilling) { if (!one_of(typeSize, 1u, 2u, 4u, 8u)) { OPENVINO_THROW("Could not load data with type size ", typeSize); } @@ -523,11 +493,11 @@ void JitKernelBase::load(const Xbyak::Xmm& v_dst, L(lEnd); } -void JitKernelBase::load(const Xbyak::Ymm& v_dst, +void JitKernelBase::load(const Xbyak::Ymm& v_dst, const Xbyak::Address& srcAddr, - const Xbyak::Reg64& rLoadNum, - const size_t typeSize, - const bool zeroFilling) { + const Xbyak::Reg64& rLoadNum, + const size_t typeSize, + const bool zeroFilling) { if (!one_of(typeSize, 1u, 2u, 4u, 8u)) { OPENVINO_THROW("Could not load data with type size ", typeSize); } @@ -564,9 +534,9 @@ void JitKernelBase::load(const Xbyak::Ymm& v_dst, } void JitKernelBase::store(const Xbyak::Address& dstAddr, - const Xbyak::Xmm& v_src, - const Xbyak::Reg64& rToStoreNum, - const size_t typeSize) { + const Xbyak::Xmm& v_src, + const Xbyak::Reg64& rToStoreNum, + const size_t typeSize) { if (!one_of(typeSize, 1u, 
2u, 4u, 8u)) { OPENVINO_THROW("Could not store data with type size ", typeSize); } @@ -592,9 +562,9 @@ void JitKernelBase::store(const Xbyak::Address& dstAddr, } void JitKernelBase::store(const Xbyak::Address& dstAddr, - const Xbyak::Ymm& v_src, - const Xbyak::Reg64& rToStoreNum, - const size_t typeSize) { + const Xbyak::Ymm& v_src, + const Xbyak::Reg64& rToStoreNum, + const size_t typeSize) { if (!one_of(typeSize, 1u, 2u, 4u, 8u)) { OPENVINO_THROW("Could not store data with type size ", typeSize); } @@ -631,11 +601,11 @@ void JitKernelBase::store(const Xbyak::Address& dstAddr, void JitKernelBase::memMovDD(const Xbyak::Reg64& rDst, const Xbyak::Reg64& rSrc, - const Xbyak::Xmm& vReadMask, - const Xbyak::Xmm& vSrcShift, + const Xbyak::Xmm& vReadMask, + const Xbyak::Xmm& vSrcShift, const Xbyak::Reg64& rToStoreNum, - const bool useMask, - const bool zeroFill) { + const bool useMask, + const bool zeroFill) { Xbyak::Label lEnd; auto rAux = getReg64(); Xbyak::Reg32 r32Aux = Xbyak::Reg32(rAux.getIdx()); @@ -671,11 +641,11 @@ void JitKernelBase::memMovDD(const Xbyak::Reg64& rDst, void JitKernelBase::memMovDD(const Xbyak::Reg64& rDst, const Xbyak::Reg64& rSrc, - const Xbyak::Ymm& vReadMask, - const Xbyak::Ymm& vSrcShift, + const Xbyak::Ymm& vReadMask, + const Xbyak::Ymm& vSrcShift, const Xbyak::Reg64& rToStoreNum, - const bool useMask, - const bool zeroFill) { + const bool useMask, + const bool zeroFill) { Xbyak::Label lEnd; if (isValidIsa(x64::avx2)) { auto vAux = RegistersPool::Reg(registersPool); @@ -684,8 +654,7 @@ void JitKernelBase::memMovDD(const Xbyak::Reg64& rDst, } else if (isValidIsa(x64::avx)) { const uint8_t typeSize = sizeof(int); const uint8_t elPerXmm = x64::cpu_isa_traits::vlen / typeSize; - Xbyak::Xmm xmmReadMask = Xbyak::Xmm(vReadMask.getIdx()), - xmmSrcShft = Xbyak::Xmm(vSrcShift.getIdx()); + Xbyak::Xmm xmmReadMask = Xbyak::Xmm(vReadMask.getIdx()), xmmSrcShft = Xbyak::Xmm(vSrcShift.getIdx()); for (uint8_t i = 0; i < 2; i++) { memMovDD(rDst, rSrc, 
xmmReadMask, xmmSrcShft, rToStoreNum, useMask, zeroFill); @@ -707,6 +676,6 @@ void JitKernelBase::memMovDD(const Xbyak::Reg64& rDst, L(lEnd); } -} // namespace kernel -} // namespace intel_cpu -} // namespace ov +} // namespace kernel +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/jit_kernel_base.hpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/jit_kernel_base.hpp index 260d7196331a7f..eee4ff4d8c0708 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/x64/jit_kernel_base.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/jit_kernel_base.hpp @@ -7,9 +7,9 @@ #include "openvino/core/visibility.hpp" #if defined(OPENVINO_ARCH_X86_64) -#include "cpu/x64/jit_generator.hpp" -#include "registers_pool.hpp" -#endif // OPENVINO_ARCH_X86_64 +# include "cpu/x64/jit_generator.hpp" +# include "registers_pool.hpp" +#endif // OPENVINO_ARCH_X86_64 namespace ov { namespace intel_cpu { @@ -19,18 +19,22 @@ class JitKernelBase; #if defined(OPENVINO_ARCH_X86_64) -#define getReg64() RegistersPool::Reg(registersPool) -#define getReg32() RegistersPool::Reg(registersPool) -#define getVmm() RegistersPool::Reg(registersPool) -#define getMask() RegistersPool::Reg(registersPool) +# define getReg64() RegistersPool::Reg(registersPool) +# define getReg32() RegistersPool::Reg(registersPool) +# define getVmm() RegistersPool::Reg(registersPool) +# define getMask() RegistersPool::Reg(registersPool) -class JitKernelBase: public dnnl::impl::cpu::x64::jit_generator { +class JitKernelBase : public dnnl::impl::cpu::x64::jit_generator { public: JitKernelBase(const char* name, dnnl::impl::cpu::x64::cpu_isa_t max_cpu_isa); - dnnl::impl::cpu::x64::cpu_isa_t getIsa() { return m_isa; } + dnnl::impl::cpu::x64::cpu_isa_t getIsa() { + return m_isa; + } - size_t getVectorLen() { return vlen; } + size_t getVectorLen() { + return vlen; + } void uni_vfmsub132ps(const Xbyak::Xmm& vDst, const Xbyak::Xmm& vSrc, const Xbyak::Operand& op); @@ -62,9 +66,9 @@ class 
JitKernelBase: public dnnl::impl::cpu::x64::jit_generator { void uni_vdivpd(const Xbyak::Xmm& v_dst, const Xbyak::Xmm& v_src, const Xbyak::Operand& op2); - void uni_vandps(const Xbyak::Xmm& vDst, const Xbyak::Xmm& vSrs, const Xbyak::Operand &op); + void uni_vandps(const Xbyak::Xmm& vDst, const Xbyak::Xmm& vSrs, const Xbyak::Operand& op); - void uni_vandnps(const Xbyak::Xmm& vDst, const Xbyak::Xmm& vSrs, const Xbyak::Operand &op); + void uni_vandnps(const Xbyak::Xmm& vDst, const Xbyak::Xmm& vSrs, const Xbyak::Operand& op); void uni_kmovd(const Xbyak::Opmask& kDst, const Xbyak::Opmask& kSrc) { kmovd(kDst, kSrc); @@ -82,11 +86,11 @@ class JitKernelBase: public dnnl::impl::cpu::x64::jit_generator { uni_vandps(kDst, kSrc1, kSrc2); } - void uni_vpbroadcastd(const Xbyak::Xmm &x, const Xbyak::Operand &op); + void uni_vpbroadcastd(const Xbyak::Xmm& x, const Xbyak::Operand& op); - void uni_vpbroadcastd(const Xbyak::Ymm &x, const Xbyak::Operand &op); + void uni_vpbroadcastd(const Xbyak::Ymm& x, const Xbyak::Operand& op); - void uni_vpbroadcastq(const Xbyak::Xmm &x, const Xbyak::Operand &op); + void uni_vpbroadcastq(const Xbyak::Xmm& x, const Xbyak::Operand& op); void uni_vroundpd(const Xbyak::Xmm& v_dst, const Xbyak::Operand& op, const uint8_t imm); @@ -98,76 +102,71 @@ class JitKernelBase: public dnnl::impl::cpu::x64::jit_generator { void uni_vshufpd(const Xbyak::Xmm& v_dst, const Xbyak::Xmm& v_srs, const Xbyak::Operand& op, uint8_t imm); - void gatherdd(const Xbyak::Xmm& vDst, - const Xbyak::Reg64& rSrcPtr, - const Xbyak::Xmm& vSrcShift, + void gatherdd(const Xbyak::Xmm& vDst, + const Xbyak::Reg64& rSrcPtr, + const Xbyak::Xmm& vSrcShift, const Xbyak::Opmask& kReadMask, - const bool useMask = true, - const bool zeroFill = false); + const bool useMask = true, + const bool zeroFill = false); - void gatherdd(const Xbyak::Xmm& vDst, + void gatherdd(const Xbyak::Xmm& vDst, const Xbyak::Reg64& rSrcPtr, - const Xbyak::Xmm& vSrcShift, - const Xbyak::Xmm& vReadMask, - const bool 
useMask = true, + const Xbyak::Xmm& vSrcShift, + const Xbyak::Xmm& vReadMask, + const bool useMask = true, const bool zeroFill = false); - void gatherdd(const Xbyak::Ymm& vDst, + void gatherdd(const Xbyak::Ymm& vDst, const Xbyak::Reg64& rSrcPtr, - const Xbyak::Ymm& vSrcShift, - const Xbyak::Ymm& vReadMask, - const bool useMask = true, + const Xbyak::Ymm& vSrcShift, + const Xbyak::Ymm& vReadMask, + const bool useMask = true, const bool zeroFill = false); - void fillRestWorkMask(const Xbyak::Opmask& kDstMask, - const Xbyak::Reg64& rWorkRest); + void fillRestWorkMask(const Xbyak::Opmask& kDstMask, const Xbyak::Reg64& rWorkRest); - void fillRestWorkMask(const Xbyak::Xmm& ymmDstMask, - const Xbyak::Reg64& rWorkRest, - const uint64_t typeSize = 4); + void fillRestWorkMask(const Xbyak::Xmm& ymmDstMask, const Xbyak::Reg64& rWorkRest, const uint64_t typeSize = 4); - void fillRestWorkMask(const Xbyak::Ymm& ymmDstMask, - const Xbyak::Reg64& rWorkRest, - const uint64_t typeSize = 4); + void fillRestWorkMask(const Xbyak::Ymm& ymmDstMask, const Xbyak::Reg64& rWorkRest, const uint64_t typeSize = 4); - void load(const Xbyak::Xmm& vDst, + void load(const Xbyak::Xmm& vDst, const Xbyak::Address& srcAddr, - const Xbyak::Reg64& rLoadNum, - const size_t typeSize, + const Xbyak::Reg64& rLoadNum, + const size_t typeSize, const bool zeroFill = false); - void load(const Xbyak::Ymm& vDst, + void load(const Xbyak::Ymm& vDst, const Xbyak::Address& srcAddr, - const Xbyak::Reg64& rLoadNum, - const size_t typeSize, + const Xbyak::Reg64& rLoadNum, + const size_t typeSize, const bool zeroFill = false); void store(const Xbyak::Address& dstAddr, - const Xbyak::Xmm& vSrc, - const Xbyak::Reg64& rToStoreNum, - const size_t typeSize); + const Xbyak::Xmm& vSrc, + const Xbyak::Reg64& rToStoreNum, + const size_t typeSize); void store(const Xbyak::Address& dstAddr, - const Xbyak::Ymm& vSrc, - const Xbyak::Reg64& rToStoreNum, - const size_t typeSize); + const Xbyak::Ymm& vSrc, + const Xbyak::Reg64& 
rToStoreNum, + const size_t typeSize); // Makes gather from memory under the vReadMask and writes to the memory m128. void memMovDD(const Xbyak::Reg64& rDst, const Xbyak::Reg64& rSrc, - const Xbyak::Xmm& vReadMask, - const Xbyak::Xmm& vSrcShift, + const Xbyak::Xmm& vReadMask, + const Xbyak::Xmm& vSrcShift, const Xbyak::Reg64& rToStoreCounter, - const bool useMask = true, + const bool useMask = true, const bool zeroFill = false); // Makes gather from the memory under the vReadMask and writes to the memory m256. void memMovDD(const Xbyak::Reg64& rDst, const Xbyak::Reg64& rSrc, - const Xbyak::Ymm& vReadMask, - const Xbyak::Ymm& vSrcShift, + const Xbyak::Ymm& vReadMask, + const Xbyak::Ymm& vSrcShift, const Xbyak::Reg64& rToStoreCounter, - const bool useMask = true, + const bool useMask = true, const bool zeroFill = false); protected: @@ -181,32 +180,37 @@ class JitKernelBase: public dnnl::impl::cpu::x64::jit_generator { enum { // Comparison predicate operand (immediate byte) for single-precision floating-point values. 
- CMP_EQ_PS = 0, // Equal (ordered, non-signaling) - CMP_LT_PS, // Less-than (ordered, signaling) - CMP_LE_PS, // Less-than-or-equal (ordered, signaling) - CMP_UNORD_PS, // Unordered (non-signaling) - CMP_NEQ_PS, // Not-equal (unordered, non-signaling) - CMP_NLT_PS, // Not-less-than (unordered, signaling) - CMP_NLE_PS, // Not-less-than-or-equal (unordered, signaling) - CMP_ORD_PS // Ordered (non-signaling) + CMP_EQ_PS = 0, // Equal (ordered, non-signaling) + CMP_LT_PS, // Less-than (ordered, signaling) + CMP_LE_PS, // Less-than-or-equal (ordered, signaling) + CMP_UNORD_PS, // Unordered (non-signaling) + CMP_NEQ_PS, // Not-equal (unordered, non-signaling) + CMP_NLT_PS, // Not-less-than (unordered, signaling) + CMP_NLE_PS, // Not-less-than-or-equal (unordered, signaling) + CMP_ORD_PS // Ordered (non-signaling) }; }; -template +template class JitKernel : public JitKernelBase { public: - using KernelFunc = void (*)(const CallArgs *); + using KernelFunc = void (*)(const CallArgs*); explicit JitKernel(const char* name, const CompileParams& jcp, dnnl::impl::cpu::x64::cpu_isa_t max_cpu_isa) - : JitKernelBase{name, max_cpu_isa}, m_jcp{jcp}, m_func{nullptr} {} + : JitKernelBase{name, max_cpu_isa}, + m_jcp{jcp}, + m_func{nullptr} {} ~JitKernel() override = default; dnnl::impl::status_t create_kernel() override { const dnnl::impl::status_t code = jit_generator::create_kernel(); if (code != dnnl::impl::status::success) { - OPENVINO_THROW("Could not create kernel. Error code: ", std::to_string(code), ". ", - "Xbyak error code: ", Xbyak::ConvertErrorToString(Xbyak::GetError())); + OPENVINO_THROW("Could not create kernel. Error code: ", + std::to_string(code), + ". ", + "Xbyak error code: ", + Xbyak::ConvertErrorToString(Xbyak::GetError())); } m_func = (decltype(m_func))jit_ker(); return code; @@ -221,21 +225,21 @@ class JitKernel : public JitKernelBase { this->operator()(&args); } - template class KernelT> + template