diff --git a/.ci/azure/linux.yml b/.ci/azure/linux.yml index f45f4e410c67ba..146775f6189f02 100644 --- a/.ci/azure/linux.yml +++ b/.ci/azure/linux.yml @@ -112,6 +112,7 @@ jobs: -DNGRAPH_ONNX_IMPORT_ENABLE=ON -DNGRAPH_ONNX_EDITOR_ENABLE=ON -DENABLE_FASTER_BUILD=ON + -DENABLE_STRICT_DEPENDENCIES=OFF -DIE_EXTRA_MODULES=$(OPENVINO_CONTRIB_REPO_DIR)/modules $(REPO_DIR) workingDirectory: $(BUILD_DIR) diff --git a/.ci/azure/linux_ngraph_onnx.yml b/.ci/azure/linux_ngraph_onnx.yml index e11e72e102d33f..1e13710f2c2b1e 100644 --- a/.ci/azure/linux_ngraph_onnx.yml +++ b/.ci/azure/linux_ngraph_onnx.yml @@ -17,6 +17,8 @@ jobs: WORK_DIR: $(Pipeline.Workspace)/_w MODELS_DIR: /mount/cinfsshare/onnxtestdata TMP_DIR: /mnt/tmp + ONNX_MODEL_ZOO_SHA: "d58213534f2a4d1c4b19ba62b3bb5f544353256e" + steps: - script: | @@ -55,7 +57,7 @@ jobs: - script: docker build --tag=openvino-onnx-ci-image --file=.ci/openvino-onnx/Dockerfile . displayName: 'Docker build' - - script: ngraph/python/tests/test_onnx/model_zoo_preprocess.sh -d $(TMP_DIR) -o + - script: ngraph/python/tests/test_onnx/model_zoo_preprocess.sh -d $(TMP_DIR) -o -s "$(ONNX_MODEL_ZOO_SHA)" displayName: 'Get models' - script: | @@ -77,6 +79,6 @@ jobs: displayName: 'Create swap' - script: | - docker run --name openvino-onnx-ci-container --volume $(TMP_DIR)/model_zoo:/root/.onnx/model_zoo --volume $(MODELS_DIR)/msft:/root/.onnx/model_zoo/MSFT openvino-onnx-ci-image + docker run --name openvino-onnx-ci-container --volume $(TMP_DIR)/model_zoo/onnx_model_zoo_$(ONNX_MODEL_ZOO_SHA):/root/.onnx/model_zoo/onnx_model_zoo --volume $(MODELS_DIR)/msft:/root/.onnx/model_zoo/MSFT openvino-onnx-ci-image /bin/bash -c "tox && tox -e zoo_models" displayName: 'Docker run' diff --git a/.ci/azure/mac.yml b/.ci/azure/mac.yml index 680ef281ac21fe..04d4c16ea23344 100644 --- a/.ci/azure/mac.yml +++ b/.ci/azure/mac.yml @@ -90,7 +90,7 @@ jobs: # Disable errors with Ninja export CXXFLAGS="-Wno-error=unused-command-line-argument" export CFLAGS="-Wno-error=unused-command-line-argument" - cmake -GNinja -DVERBOSE_BUILD=ON -DCMAKE_BUILD_TYPE=$(BUILD_TYPE) -DENABLE_PYTHON=ON -DENABLE_TESTS=ON -DIE_EXTRA_MODULES=$(OPENVINO_CONTRIB_REPO_DIR)/modules $(REPO_DIR) + cmake -GNinja -DVERBOSE_BUILD=ON -DCMAKE_BUILD_TYPE=$(BUILD_TYPE) -DENABLE_PYTHON=ON -DENABLE_TESTS=ON -DENABLE_STRICT_DEPENDENCIES=OFF -DIE_EXTRA_MODULES=$(OPENVINO_CONTRIB_REPO_DIR)/modules $(REPO_DIR) workingDirectory: $(BUILD_DIR) displayName: 'CMake' diff --git a/.ci/azure/windows.yml b/.ci/azure/windows.yml index 6b4e5203dd08f4..21a36392e33812 100644 --- a/.ci/azure/windows.yml +++ b/.ci/azure/windows.yml @@ -92,7 +92,7 @@ jobs: - script: | set PATH=$(WORK_DIR)\ninja-win;%PATH% - call "$(MSVS_VARS_PATH)" && cmake -GNinja -DENABLE_FASTER_BUILD=ON -DENABLE_TEMPLATE_PLUGIN=ON -DCMAKE_BUILD_TYPE=$(BUILD_TYPE) -DENABLE_TESTS=ON -DIE_EXTRA_MODULES=$(OPENVINO_CONTRIB_REPO_DIR)\modules -DCMAKE_C_COMPILER:PATH="$(MSVC_COMPILER_PATH)" -DCMAKE_CXX_COMPILER:PATH="$(MSVC_COMPILER_PATH)" $(REPO_DIR) + call "$(MSVS_VARS_PATH)" && cmake -GNinja -DENABLE_FASTER_BUILD=ON -DENABLE_TEMPLATE_PLUGIN=ON -DCMAKE_BUILD_TYPE=$(BUILD_TYPE) -DENABLE_TESTS=ON -DENABLE_STRICT_DEPENDENCIES=OFF -DIE_EXTRA_MODULES=$(OPENVINO_CONTRIB_REPO_DIR)\modules -DCMAKE_C_COMPILER:PATH="$(MSVC_COMPILER_PATH)" -DCMAKE_CXX_COMPILER:PATH="$(MSVC_COMPILER_PATH)" $(REPO_DIR) workingDirectory: $(BUILD_DIR) displayName: 'CMake' diff --git a/.ci/openvino-onnx/Jenkinsfile b/.ci/openvino-onnx/Jenkinsfile index 48529879ef162e..ff2e8a451253c4 100644 --- a/.ci/openvino-onnx/Jenkinsfile 
+++ b/.ci/openvino-onnx/Jenkinsfile @@ -113,8 +113,8 @@ def buildDockerImage(Map configuration, String workdir) { --build-arg BUILD_TYPE=${configuration.build_type} \ --build-arg PROTOBUF_LITE=${configuration.protobuf_lite} \ --file=.ci/openvino-onnx/Dockerfile \ - --build-arg http_proxy=http://proxy-chain.intel.com:911/ \ - --build-arg https_proxy=http://proxy-chain.intel.com:912/ . + --build-arg http_proxy=http://proxy-ir.intel.com:911/ \ + --build-arg https_proxy=http://proxy-ir.intel.com:911/ . """ } @@ -155,10 +155,9 @@ def getConfigurationsMap() { CONFIGURATION_WORKFLOW = { configuration -> node("OpenVINO") { + String workdir = "${HOME}/workspace/${BUILD_NUMBER}_${env.CHANGE_ID}_${configuration.name}" try { PROJECT_NAME = "openvino" - String workdir = "${HOME}/workspace/${BUILD_NUMBER}_${env.CHANGE_ID}_${configuration.name}" - stage("Clone repository") { prepare_repository(workdir) } @@ -185,10 +184,10 @@ CONFIGURATION_WORKFLOW = { configuration -> } finally { stage("Cleanup") { - deleteDir() String docker_container_name = get_docker_container_name(configuration) sh """ docker rm -f ${docker_container_name} + rm -rf ${workdir} """ } } diff --git a/.github/org_control/check_pr.py b/.github/org_control/check_pr.py index 882854724ca646..0e23251e1571a4 100644 --- a/.github/org_control/check_pr.py +++ b/.github/org_control/check_pr.py @@ -139,29 +139,37 @@ def update_labels(gh_api, pull, non_org_intel_pr_users, non_org_pr_users): def get_wrong_commits(pull): """Returns commits with incorrect user and email""" - print("GitHub PR user email:", pull.user.email) + pr_author_email = (pull.user.email or "").lower() + print("GitHub PR author email:", pr_author_email) print("Check commits:") wrong_commits = set() for commit in pull.get_commits(): # import pprint; pprint.pprint(commit.raw_data) print("Commit SHA:", commit.sha) # Use raw data because commit author can be non GitHub user - commit_email = commit.raw_data["commit"]["author"]["email"] - print(" Commit email:", commit_email) + commit_author_email = (commit.raw_data["commit"]["author"]["email"] or "").lower() + commit_committer_email = (commit.raw_data["commit"]["committer"]["email"] or "").lower() + print(" Commit author email:", commit_author_email) + print(" Commit committer email:", commit_committer_email) if not github_api.is_valid_user(commit.author): print( - " ERROR: User with the commit email is absent in GitHub:", + " ERROR: User with the commit author email is absent in GitHub:", commit.raw_data["commit"]["author"]["name"], ) wrong_commits.add(commit.sha) + if not github_api.is_valid_user(commit.committer): + print( + " ERROR: User with the commit committer email is absent in GitHub:", + commit.raw_data["commit"]["committer"]["name"], + ) + wrong_commits.add(commit.sha) if not commit.raw_data["commit"]["verification"]["verified"]: print( " WARNING: The commit is not verified. 
Reason:", commit.raw_data["commit"]["verification"]["reason"], ) - if pull.user.email != commit_email: - print(" ERROR: Commit email and GitHub user public email are differnt") - wrong_commits.add(commit.sha) + if pr_author_email != commit_author_email or pr_author_email != commit_committer_email: + print(" WARNING: Commit emails and GitHub PR author public email are differnt") return wrong_commits @@ -229,7 +237,7 @@ def main(): if wrong_pulls: for pull_number, wrong_commits in wrong_pulls.items(): print( - f"\nERROR: Remove or replace wrong commits in the PR {pull_number}:\n ", + f"\nERROR: Remove or replace wrong commits in the PR {pull_number}:\n ", "\n ".join(wrong_commits), ) print( diff --git a/.github/workflows/code_style.yml b/.github/workflows/code_style.yml index b538a1793397ef..607fe2cb64ae1a 100644 --- a/.github/workflows/code_style.yml +++ b/.github/workflows/code_style.yml @@ -15,14 +15,17 @@ jobs: - name: Install dependencies run: | sudo apt --assume-yes install libusb-1.0-0-dev + python3 -m pip install --upgrade pip python3 -m pip install -r ./inference-engine/ie_bridges/python/requirements.txt + # Add for -DENABLE_PYTHON=ON, no cython + python3 -m pip install -r ./inference-engine/ie_bridges/python/src/requirements-dev.txt # Run cmake with -DENABLE_PROFILING_ITT=ON -DSELECTIVE_BUILD=COLLECT in order to enable codestyle check for ITT collector - name: CMake run: | mkdir build cd build - cmake -DENABLE_PROFILING_ITT=ON -DSELECTIVE_BUILD=COLLECT .. + cmake -DENABLE_PYTHON=ON -DENABLE_PROFILING_ITT=ON -DSELECTIVE_BUILD=COLLECT .. - name: Check code style run: cmake --build build --target clang_format_check_all diff --git a/.github/workflows/mo.yml b/.github/workflows/mo.yml index eb2ea91484e7ca..ebc0827adb0622 100644 --- a/.github/workflows/mo.yml +++ b/.github/workflows/mo.yml @@ -62,42 +62,4 @@ jobs: mkdir ../mo-ut-logs python3 -m xmlrunner discover -p *_test.py --output=../mo-ut-logs working-directory: model-optimizer - - build_wheel: - name: Build Python wheel - runs-on: ubuntu-18.04 - steps: - - uses: actions/checkout@v2 - - name: Install dependencies - run: | - python3 -m pip install --upgrade pip - python3 -m pip install wheel setuptools - python3 -m pip install tensorflow==2.3.0 - - - name: Build - run: | - python3 setup.py sdist bdist_wheel - working-directory: model-optimizer - - - name: Test package content - run: | - echo "src = open('openvino_mo.egg-info/SOURCES.txt', 'rt').read().split()" | tee -a test_wheel.py - echo "ref = open('automation/package_BOM.txt', 'rt').read().split()" | tee -a test_wheel.py - echo "for name in ref:" | tee -a test_wheel.py - echo " if name.endswith('.py'):" | tee -a test_wheel.py - echo " assert name in src or './' + name in src, name + ' file missed'" | tee -a test_wheel.py - python3 test_wheel.py - working-directory: model-optimizer - - - name: Test conversion - run: | - wget -q http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224.tgz - tar -xf mobilenet_v1_1.0_224.tgz - python3 -m pip install model-optimizer/dist/*.whl - python3 -m mo --input_model mobilenet_v1_1.0_224_frozen.pb --input_shape "[1,224,224,3]" - - - uses: actions/upload-artifact@v2 - with: - name: mo_wheel - path: "model-optimizer/dist/*.whl" diff --git a/.gitmodules b/.gitmodules index dad6d39352d7b0..c2cb47a6158025 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,19 +1,7 @@ -[submodule "inference-engine/thirdparty/ade"] - path = inference-engine/thirdparty/ade - url = https://github.com/opencv/ade.git - ignore = dirty [submodule 
"inference-engine/thirdparty/mkl-dnn"] path = inference-engine/thirdparty/mkl-dnn url = https://github.com/openvinotoolkit/oneDNN.git ignore = dirty -[submodule "inference-engine/tests/ie_test_utils/common_test_utils/gtest"] - path = inference-engine/tests/ie_test_utils/common_test_utils/gtest - url = https://github.com/openvinotoolkit/googletest.git - ignore = dirty -[submodule "inference-engine/samples/thirdparty/gflags"] - path = inference-engine/samples/thirdparty/gflags - url = https://github.com/gflags/gflags.git - ignore = dirty [submodule "thirdparty/xbyak"] path = thirdparty/xbyak url = https://github.com/herumi/xbyak.git @@ -22,3 +10,15 @@ path = thirdparty/zlib/zlib url = https://github.com/madler/zlib.git ignore = dirty +[submodule "thirdparty/ade"] + path = thirdparty/ade + url = https://github.com/opencv/ade.git + ignore = dirty +[submodule "thirdparty/gflags"] + path = thirdparty/gflags + url = https://github.com/gflags/gflags.git + ignore = dirty +[submodule "thirdparty/gtest"] + path = thirdparty/gtest + url = https://github.com/openvinotoolkit/googletest.git + ignore = dirty diff --git a/CMakeLists.txt b/CMakeLists.txt index e0706a72e87e74..3602750435c550 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -169,10 +169,11 @@ ie_shellcheck_process(DIRECTORY "${OpenVINO_MAIN_SOURCE_DIR}" "${IE_MAIN_SOURCE_DIR}/thirdparty" "${IE_MAIN_SOURCE_DIR}/temp" # TODO fix and enable back: - "${OpenVINO_MAIN_SOURCE_DIR}/scripts/install_dependencies" - "${OpenVINO_MAIN_SOURCE_DIR}/scripts/demo" - "${OpenVINO_MAIN_SOURCE_DIR}/ngraph" - "${IE_MAIN_SOURCE_DIR}/scripts") + "${OpenVINO_MAIN_SOURCE_DIR}/inference-engine/scripts/dependencies.sh" + "${OpenVINO_MAIN_SOURCE_DIR}/scripts/install_dependencies/install_NEO_OCL_driver.sh" + "${OpenVINO_MAIN_SOURCE_DIR}/scripts/install_dependencies/install_openvino_dependencies.sh" + "${OpenVINO_MAIN_SOURCE_DIR}/ngraph/python/tests/test_onnx/model_zoo_preprocess.sh" + ) # # cpack diff --git a/README.md b/README.md index 30c314e76de9b3..7ce2e9f998a1f6 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ Please report questions, issues and suggestions using: --- \* Other names and brands may be claimed as the property of others. -[Open Model Zoo]:https://github.com/opencv/open_model_zoo +[Open Model Zoo]:https://github.com/openvinotoolkit/open_model_zoo [Inference Engine]:https://software.intel.com/en-us/articles/OpenVINO-InferEngine [Model Optimizer]:https://software.intel.com/en-us/articles/OpenVINO-ModelOptimizer [nGraph]:https://docs.openvinotoolkit.org/latest/openvino_docs_nGraph_DG_DevGuide.html diff --git a/cmake/features.cmake b/cmake/features.cmake index fe1b8919b51212..aff805adb15a64 100644 --- a/cmake/features.cmake +++ b/cmake/features.cmake @@ -6,6 +6,8 @@ ie_dependent_option (ENABLE_MKL_DNN "MKL-DNN plugin for inference engine" ON "X8 ie_option (ENABLE_TESTS "unit, behavior and functional tests" OFF) +ie_option (ENABLE_STRICT_DEPENDENCIES "Skip configuring \"convinient\" dependencies for efficient parallel builds" ON) + ie_dependent_option (ENABLE_CLDNN "clDnn based plugin for inference engine" ON "X86_64;NOT APPLE;NOT MINGW;NOT WINDOWS_STORE;NOT WINDOWS_PHONE" OFF) ie_option (ENABLE_PROFILING_ITT "Build with ITT tracing. Optionally configure pre-built ittnotify library though INTEL_VTUNE_DIR variable." OFF) @@ -18,8 +20,6 @@ Supported values:\ ie_option (ENABLE_PROFILING_FIRST_INFERENCE "Build with ITT tracing of first inference time." 
ON) -ie_option (ENABLE_DOCS "Build docs using Doxygen" OFF) - ie_option(ENABLE_TEMPLATE_PLUGIN "Register template plugin into plugins.xml" OFF) ie_option_enum(SELECTIVE_BUILD "Enable OpenVINO conditional compilation or statistics collection. \ @@ -33,6 +33,9 @@ ie_option(ENABLE_ERROR_HIGHLIGHT "Highlight errors and warnings during compile t find_package(PythonLibs 3 QUIET) ie_dependent_option (ENABLE_PYTHON "enables ie python bridge build" OFF "PYTHONLIBS_FOUND" OFF) +find_package(PythonInterp 3 QUIET) +ie_dependent_option (ENABLE_DOCS "Build docs using Doxygen" OFF "PYTHONINTERP_FOUND" OFF) + # # enable or disable output from NGRAPH_DEBUG statements # diff --git a/docs/IE_DG/API_Changes.md b/docs/IE_DG/API_Changes.md index a234471c13e550..df04ddd6789f07 100644 --- a/docs/IE_DG/API_Changes.md +++ b/docs/IE_DG/API_Changes.md @@ -10,11 +10,56 @@ The sections below contain detailed list of changes made to the Inference Engine ### Deprecated API + **InferenceEngine::Parameter** + * InferenceEngine::Parameter(const std::shared_ptr&) * InferenceEngine::Parameter(std::shared_ptr& var) * std::shared_ptr InferenceEngine::Parameter::asVariant() const * InferenceEngine::Parameter::operator std::shared_ptr() const + **GPU plugin configuration keys** + * KEY_CLDNN_NV12_TWO_INPUTS GPU plugin option. Use KEY_GPU_NV12_TWO_INPUTS instead + * KEY_CLDNN_PLUGIN_PRIORITY GPU plugin option. Use KEY_GPU_PLUGIN_PRIORITY instead + * KEY_CLDNN_PLUGIN_THROTTLE GPU plugin option. Use KEY_GPU_PLUGIN_THROTTLE instead + * KEY_CLDNN_MEM_POOL GPU plugin option + * KEY_CLDNN_GRAPH_DUMPS_DIR GPU plugin option + * KEY_CLDNN_SOURCES_DUMPS_DIR GPU plugin option + * KEY_DUMP_KERNELS GPU plugin option + * KEY_TUNING_MODE GPU plugin option + * KEY_TUNING_FILE GPU plugin option + + **InferenceEngine::IInferRequest** + * IInferRequest interface is deprecated, use InferRequest wrapper: + * Constructor for InferRequest from IInferRequest:: Ptr is deprecated + * Cast operator for InferRequest to IInferRequest shared pointer is deprecated + + **InferenceEngine::ICNNNetwork** + * ICNNNetwork interface is deprecated by means of deprecation of all its methods, use CNNNetwork wrapper + * CNNNetwork methods working with ICNNNetwork are deprecated: + * Cast to ICNNNetwork shared pointer + * Cast to reference to ICNNNetwork interface + * Constructor from ICNNNetwork shared pointer + + **InferenceEngine::IExecutableNetwork** + * IExecutableNetwork is deprecated, use ExecutableNetwork wrappers: + * Constructor of ExecutableNetwork from IExecutableNetwork shared pointer is deprecated + * The following ExecutableNetwork methods are deprecated: + * ExecutableNetwork::reset + * Cast operator to IExecutableNetwork shared pointer + * ExecutableNetwork::CreateInferRequestPtr - use ExecutableNetwork::CreateInferRequest instead + + **Extensions API** + * InferenceEngine::make_so_pointer which is used to create Extensions library is replaced by std::make_shared(..) + * InferenceEngine::IExtension::Release is deprecated with no replacement + * Use IE_DEFINE_EXTENSION_CREATE_FUNCTION helper macro instead of explicit declaration of CreateExtension function, which create extension. + + **Other changes** + * Version::ApiVersion structure is deprecated, Inference Engine does not have API version anymore + * LowLatency - use lowLatency2 instead + * CONFIG_KEY(DUMP_EXEC_GRAPH_AS_DOT) - use InferenceEngine::ExecutableNetwork::GetExecGraphInfo::serialize() instead + * Core::ImportNetwork with no device - pass device name explicitly. 
+ * details::InferenceEngineException - use InferenceEngine::Exception and its derivatives instead. + ## 2021.3 ### New API @@ -528,7 +573,7 @@ The sections below contain detailed list of changes made to the Inference Engine * DLIA_CONFIG_KEY(ENABLE_STREAMING) config key ### Removed API - + * InferenceEngine::EltwiseLayer::Select from InferenceEngine::EltwiseLayer::eOperation enumeration ## 2019 R2 @@ -577,7 +622,7 @@ The sections below contain detailed list of changes made to the Inference Engine * DLIA_CONFIG_KEY(IO_TRANSFORMATIONS_NATIVE) config key * DLIA_CONFIG_KEY(DUMP_SUPPORTED_LAYERS_INFORMATION) config key * GNA_CONFIG_VALUE(SW_FP32) config value for GNA_CONFIG_KEY(DEVICE_MODE) key - * MULTI_CONFIG_KEY(DEVICE_PRIORITIES) config key for `MULTI` device + * MULTI_CONFIG_KEY(DEVICE_PRIORITIES) config key for `MULTI` device * InferenceEngine::CNNNetReader::ReadNetwork(const std::wstring &filepath) new method * InferenceEngine::CNNNetReader::ReadWeights(const std::wstring &filepath) new method * InferenceEngine::ExecutableNetwork::ExecutableNetwork(IExecutableNetwork::Ptr actual, InferenceEnginePluginPtr plg) constructor with additional `plg` parameter @@ -593,7 +638,7 @@ The sections below contain detailed list of changes made to the Inference Engine * InferenceEngine::EltwiseLayer::Logical_NOT, InferenceEngine::EltwiseLayer::Mean, InferenceEngine::EltwiseLayer::Select extensions to InferenceEngine::EltwiseLayer::eOperation enumeration * InferenceEngine::OneHotLayer new class * InferenceEngine::SelectLayer new class - * InferenceEngine::BroadcastLayer new class + * InferenceEngine::BroadcastLayer new class * InferenceEngine::MathLayer new class * InferenceEngine::ReduceLayer new class * InferenceEngine::TopKLayer new class diff --git a/docs/IE_DG/Extensibility_DG/GPU_Kernel.md b/docs/IE_DG/Extensibility_DG/GPU_Kernel.md index 09ace6f0a2942f..d9fd809f8e4227 100644 --- a/docs/IE_DG/Extensibility_DG/GPU_Kernel.md +++ b/docs/IE_DG/Extensibility_DG/GPU_Kernel.md @@ -219,22 +219,6 @@ __kernel void example_relu_kernel( ## Debugging Tips -* **Dumping the Resulting Kernels**. -It is recommended to get a dump of the kernel with all of -the values set by the Inference Engine, such as tensor sizes, -floating-point, and integer kernel parameters. To get the dump, add the -following line to your code that configures the GPU plugin to output the -custom kernels: - -@snippet snippets/GPU_Kernel.cpp part1 - -When the Inference Engine compiles the kernels for the specific network, -it also outputs the resulting code for the custom kernels. In the -directory of your executable, find files like -`clDNN_program0.cl`, `clDNN_program1.cl`. There are as many files as -distinct sets of parameters for your custom kernel: different input -tensor sizes and kernel parameters. - * **Using `printf` in the OpenCL™ Kernels**. To debug the specific values, you can use `printf` in your kernels. However, be careful: for instance, do not output excessively diff --git a/docs/IE_DG/GPU_Kernels_Tuning.md b/docs/IE_DG/GPU_Kernels_Tuning.md deleted file mode 100644 index 5bb6a8334b2372..00000000000000 --- a/docs/IE_DG/GPU_Kernels_Tuning.md +++ /dev/null @@ -1,39 +0,0 @@ -Using GPU Kernels Tuning {#openvino_docs_IE_DG_GPU_Kernels_Tuning} -====================== - -GPU Kernels Tuning allows you to tune models, so the heavy computational layers are configured to fit better into -hardware, which the tuning was done on. It is required to achieve best performance on GPU. 
-> **NOTE** Currently only convolution and fully connected layers undergo tuning process. It means that the performance boost depends on the amount of that layers in the model. - -OpenVINO™ releases include the `/inference_engine/bin/intel64/Release/cache.json` file with pretuned data for current state of the art models. It is highly recommended to do the -tuning for new kind of models, hardwares or drivers. - -## Tuned data - -GPU tuning data is saved in JSON format. The file is composed of 2 types of attributes and 1 type of value: -* Execution units number (attribute): splits the content into different EU sections -* Hash (attribute): hashed tuned kernel data -* Key (value): Array with kernel name and kernel's mode index - -## Usage - ---- - -You can activate Kernels Tuning process by setting `KEY_TUNING_MODE` flag to `TUNING_CREATE` and `KEY_TUNING_FILE` to `<"filename">` in a configuration map that is -passed to the plugin while loading a network. -This configuration modifies the behavior of the `ExecutableNetwork` object. Instead of standard network compilation, it will run the tuning process. -Please keep in mind that the tuning can be very time consuming. The bigger the network, the longer it will take. -File with tuned data is the result of this step. - -> **NOTE** If a filename passed to `KEY_TUNING_FILE` points to existing tuned data and you are tuning a new model, then this file will be extended by new data. This allows you to extend existing `cache.json` provided in the OpenVINO™ release package. - -The example below shows how to set and use the key files: - -@snippet snippets/GPU_Kernels_Tuning.cpp part0 - ---- - -You can activate the inference with tuned data by setting `KEY_TUNING_MODE` flag to `TUNING_USE_EXISTING` and -`KEY_TUNING_FILE` flag to `<"filename">`. - -GPU backend will process the content of the file during network compilation to configure the OpenCL kernels for the best performance. diff --git a/docs/IE_DG/Int8Inference.md b/docs/IE_DG/Int8Inference.md index 917c7836de293b..889af6a53278b1 100644 --- a/docs/IE_DG/Int8Inference.md +++ b/docs/IE_DG/Int8Inference.md @@ -1,6 +1,13 @@ # Low-Precision 8-bit Integer Inference {#openvino_docs_IE_DG_Int8Inference} -## Disclaimer +## Table of Contents +1. [Supported devices](#supported-devices) +2. [Low-Precision 8-bit Integer Inference Workflow](#low-precision-8-bit-integer-inference-workflow) +3. [Prerequisites](#prerequisites) +4. [Inference](#inference) +5. [Results analysis](#results-analysis) + +## Supported devices Low-precision 8-bit inference is optimized for: - Intel® architecture processors with the following instruction set architecture extensions: @@ -12,16 +19,22 @@ Low-precision 8-bit inference is optimized for: - Intel® Iris® Xe Graphics - Intel® Iris® Xe MAX Graphics - A model must be quantized. You can use a quantized model from [OpenVINO™ Toolkit Intel's Pre-Trained Models](@ref omz_models_group_intel) or quantize a model yourself. For quantization, you can use the: - - [Post-Training Optimization Tool](@ref pot_README) delivered with the Intel® Distribution of OpenVINO™ toolkit release package. + - [Post-Training Optimization Tool](@ref pot_docs_LowPrecisionOptimizationGuide) delivered with the Intel® Distribution of OpenVINO™ toolkit release package. 
- [Neural Network Compression Framework](https://www.intel.com/content/www/us/en/artificial-intelligence/posts/openvino-nncf.html) available on GitHub: https://github.com/openvinotoolkit/nncf -## Introduction - -A lot of investigation was made in the field of deep learning with the idea of using low precision computations during inference in order to boost deep learning pipelines and gather higher performance. For example, one of the popular approaches is to shrink the precision of activations and weights values from `fp32` precision to smaller ones, for example, to `fp11` or `int8`. For more information about this approach, refer to -**Brief History of Lower Precision in Deep Learning** section in [this whitepaper](https://software.intel.com/en-us/articles/lower-numerical-precision-deep-learning-inference-and-training). +## Low-Precision 8-bit Integer Inference Workflow 8-bit computations (referred to as `int8`) offer better performance compared to the results of inference in higher precision (for example, `fp32`), because they allow loading more data into a single processor instruction. Usually the cost for significant boost is a reduced accuracy. However, it is proved that an accuracy drop can be negligible and depends on task requirements, so that the application engineer can set up the maximum accuracy drop that is acceptable. +For 8-bit integer computations, a model must be quantized. Quantized models can be downloaded from [Overview of OpenVINO™ Toolkit Intel's Pre-Trained Models](@ref omz_models_group_intel). If the model is not quantized, you can use the [Post-Training Optimization Tool](@ref pot_README) to quantize the model. The quantization process adds [FakeQuantize](../ops/quantization/FakeQuantize_1.md) layers on activations and weights for most layers. Read more about mathematical computations in the [Uniform Quantization with Fine-Tuning](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md). + +When you pass the quantized IR to the OpenVINO™ plugin, the plugin automatically recognizes it as a quantized model and performs 8-bit inference. Note, if you pass a quantized model to another plugin that does not support 8-bit inference but supports all operations from the model, the model is inferred in precision that this plugin supports. + +In *Runtime stage* stage, the quantized model is loaded to the plugin. The plugin uses `Low Precision Transformation` component to update the model to infer it in low precision: + - Update `FakeQuantize` layers to have quantized output tensors in low precision range and add dequantization layers to compensate the update. Dequantization layers are pushed through as many layers as possible to have more layers in low precision. After that, most layers have quantized input tensors in low precision range and can be inferred in low precision. Ideally, dequantization layers should be fused in the next `FakeQuantize` layer. + - Weights are quantized and stored in `Constant` layers. + +## Prerequisites Let's explore quantized [TensorFlow* implementation of ResNet-50](https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/public/resnet-50-tf) model. 
Use [Model Downloader](@ref omz_tools_downloader) tool to download the `fp16` model from [OpenVINO™ Toolkit - Open Model Zoo repository](https://github.com/openvinotoolkit/open_model_zoo): ```sh @@ -31,28 +44,16 @@ After that you should quantize model by the [Model Quantizer](@ref omz_tools_dow ```sh ./quantizer.py --model_dir public/resnet-50-tf --dataset_dir --precisions=FP16-INT8 ``` + +## Inference + The simplest way to infer the model and collect performance counters is [C++ Benchmark Application](../../inference-engine/samples/benchmark_app/README.md). ```sh ./benchmark_app -m resnet-50-tf.xml -d CPU -niter 1 -api sync -report_type average_counters -report_folder pc_report_dir ``` If you infer the model with the OpenVINO™ CPU plugin and collect performance counters, all operations (except last not quantized SoftMax) are executed in INT8 precision. -## Low-Precision 8-bit Integer Inference Workflow - -For 8-bit integer computations, a model must be quantized. Quantized models can be downloaded from [Overview of OpenVINO™ Toolkit Intel's Pre-Trained Models](@ref omz_models_group_intel). If the model is not quantized, you can use the [Post-Training Optimization Tool](@ref pot_README) to quantize the model. The quantization process adds [FakeQuantize](../ops/quantization/FakeQuantize_1.md) layers on activations and weights for most layers. Read more about mathematical computations in the [Uniform Quantization with Fine-Tuning](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md). - -8-bit inference pipeline includes two stages (also refer to the figure below): -1. *Offline stage*, or *model quantization*. During this stage, [FakeQuantize](../ops/quantization/FakeQuantize_1.md) layers are added before most layers to have quantized tensors before layers in a way that low-precision accuracy drop for 8-bit integer inference satisfies the specified threshold. The output of this stage is a quantized model. Quantized model precision is not changed, quantized tensors are in original precision range (`fp32`). `FakeQuantize` layer has `levels` attribute which defines quants count. Quants count defines precision which is used during inference. For `int8` range `levels` attribute value has to be 255 or 256. To quantize the model, you can use the [Post-Training Optimization Tool](@ref pot_README) delivered with the Intel® Distribution of OpenVINO™ toolkit release package. - - When you pass the quantized IR to the OpenVINO™ plugin, the plugin automatically recognizes it as a quantized model and performs 8-bit inference. Note, if you pass a quantized model to another plugin that does not support 8-bit inference but supports all operations from the model, the model is inferred in precision that this plugin supports. - -2. *Runtime stage*. This stage is an internal procedure of the OpenVINO™ plugin. During this stage, the quantized model is loaded to the plugin. The plugin uses `Low Precision Transformation` component to update the model to infer it in low precision: - - Update `FakeQuantize` layers to have quantized output tensors in low precision range and add dequantization layers to compensate the update. Dequantization layers are pushed through as many layers as possible to have more layers in low precision. After that, most layers have quantized input tensors in low precision range and can be inferred in low precision. Ideally, dequantization layers should be fused in the next `FakeQuantize` layer. 
- - Weights are quantized and stored in `Constant` layers. - -![int8_flow] - -## Performance Counters +## Results analysis Information about layer precision is stored in the performance counters that are available from the Inference Engine API. For example, the part of performance counters table for quantized [TensorFlow* implementation of ResNet-50](https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/public/resnet-50-tf) model inference on [CPU Plugin](supported_plugins/CPU.md) looks as follows: @@ -79,5 +80,3 @@ available from the Inference Engine API. For example, the part of performance co > * Suffix `FP32` for layers computed in 32-bit precision All `Convolution` layers are executed in int8 precision. Rest layers are fused into Convolutions using post operations optimization technique, which is described in [Internal CPU Plugin Optimizations](supported_plugins/CPU.md). - -[int8_flow]: img/cpu_int8_flow.png diff --git a/docs/IE_DG/Intro_to_Performance.md b/docs/IE_DG/Intro_to_Performance.md index 66fcf48c34f3c5..48d1ea5c56cff0 100644 --- a/docs/IE_DG/Intro_to_Performance.md +++ b/docs/IE_DG/Intro_to_Performance.md @@ -1,24 +1,29 @@ # Introduction to the Performance Topics {#openvino_docs_IE_DG_Intro_to_Performance} This section is a shorter version of the -[Optimization Guide](supported_plugins/MULTI.md) for the Intel Deep Learning Deployment Toolkit. +[Optimization Guide](../optimization_guide/dldt_optimization_guide.md) for the Intel® Distribution of OpenVINO™ Toolkit. ## Precision Inference precision directly affects the performance. -Model Optimizer can produce an IR with different precision. For example, float16 IR initially targets VPU and GPU devices, while, for example, the CPU can also execute regular float32. -Also, further device-specific inference precision settings are available, for example, [8-bit integer](Int8Inference.md) or [bfloat16](Bfloat16Inference.md) inference on the CPU. -Note that for [MULTI device](supported_plugins/MULTI.md) that supports automatic inference on multiple devices in parallel, you can use the FP16 IR. +Model Optimizer can produce an IR with different precision. For example, an FP16 IR initially targets VPU and GPU devices, while, for example, for the CPU, an FP16 IR is typically up-scaled to the regular FP32 automatically upon loading. But notice that further device-specific inference precision settings are available, +for example, [8-bit integer](Int8Inference.md) or [bfloat16](Bfloat16Inference.md), which is specific to the CPU inference, below. +Note that for the [MULTI device](supported_plugins/MULTI.md) plugin that supports automatic inference on multiple devices in parallel, you can use an FP16 IR (no need for FP32). You can find more information, including preferred data types for specific devices, in the -[Supported Devices](supported_plugins/Supported_Devices.md) section. +[Supported Devices](supported_plugins/Supported_Devices.md) document. -## Lowering Inference Precision -Default optimization is used for CPU and implies that inference is made with lower precision if it is possible on a given platform to reach better performance with acceptable range of accuracy. -This approach can be used for CPU devices where the platform supports the AVX512_BF16 instruction. In this case, a regular float32 model is converted to [bfloat16](Bfloat16Inference.md) internal representation and inference is provided with bfloat16 layers usage. 
-Below is the example command line to disable this feature on the CPU device with the AVX512_BF16 instruction and execute regular float32. +## Automatic Lowering of the Inference Precision +By default, plugins enable the optimizations that allow lower precision if the acceptable range of accuracy is preserved. +For example, for the CPU that supports the AVX512_BF16 instructions, an FP16/FP32 model is converted to a [bfloat16](Bfloat16Inference.md) IR to accelerate inference. +To compare the associated speedup, run the example command below to disable this feature on the CPU device with the AVX512_BF16 support and get regular FP32 execution: ``` $ benchmark_app -m -enforcebf16=false ``` +Notice that for quantized (e.g. INT8) models the bfloat16 calculations (of the layers that remain in FP32) is disabled by default. +Refer to the [CPU Plugin documentation](supported_plugins/CPU.md) for more details. + +Similarly, the GPU device automatically executes FP16 for the layers that remain in FP16 in the quantized models (assuming that the FP16 model was quantized). +Refer to the ENABLE_FP16_FOR_QUANTIZED_MODELS key in the [GPU Plugin documentation](supported_plugins/GPU.md). ## Latency vs. Throughput One way to increase computational efficiency is batching, which combines many (potentially tens) of @@ -26,6 +31,12 @@ input images to achieve optimal throughput. However, high batch size also comes latency penalty. So, for more real-time oriented usages, lower batch sizes (as low as a single input) are used. Refer to the [Benchmark App](../../inference-engine/samples/benchmark_app/README.md) sample, which allows latency vs. throughput measuring. +## Using Caching API for first inference latency optimization +Since with the 2021.4 release, Inference Engine provides an ability to enable internal caching of loaded networks. +This can significantly reduce load network latency for some devices at application startup. +Internally caching uses plugin's Export/ImportNetwork flow, like it is done for [Compile tool](../../inference-engine/tools/compile_tool/README.md), using the regular ReadNetwork/LoadNetwork API. +Refer to the [Model Caching Overview](Model_caching_overview.md) for more detailed explanation. + ## Using Async API To gain better performance on accelerators, such as VPU, the Inference Engine uses the asynchronous approach (see [Integrating Inference Engine in Your Application (current API)](Integrate_with_customer_application_new_API.md)). @@ -44,17 +55,17 @@ Below is the example command line that limits the execution to the single socket limited to the single socket). $ numactl -m 0 --physcpubind 0-27 benchmark_app -m -api sync -nthreads 28 ``` -Note that if you have more than one input, running as many inference requests as you have NUMA nodes (or sockets) +Note that if you have more than one input, running as many inference streams as you have NUMA nodes (or sockets) usually gives the same best latency as a single request on the single socket, but much higher throughput. Assuming two NUMA nodes machine: ``` $ benchmark_app -m -nstreams 2 ``` Number of NUMA nodes on the machine can be queried via 'lscpu'. -Please see more on the NUMA support in the [Optimization Guide](supported_plugins/MULTI.md). +Please see more on the NUMA support in the [Optimization Guide](../optimization_guide/dldt_optimization_guide.md). ## Throughput Mode for CPU Unlike most accelerators, CPU is perceived as an inherently latency-oriented device. 
-Since 2018 R5 release, the Inference Engine introduced the "throughput" mode, which allows the Inference Engine to efficiently run multiple inference requests on the CPU simultaneously, greatly improving the throughput. +OpenVINO™ toolkit provides a "throughput" mode that allows running multiple inference requests on the CPU simultaneously, which greatly improves the throughput. Internally, the execution resources are split/pinned into execution "streams". Using this feature gains much better performance for the networks that originally are not scaled well with a number of threads (for example, lightweight topologies). This is especially pronounced for the many-core server machines. @@ -62,38 +73,26 @@ Using this feature gains much better performance for the networks that originall Run the [Benchmark App](../../inference-engine/samples/benchmark_app/README.md) and play with number of infer requests running in parallel, next section. Try different values of the `-nstreams` argument from `1` to a number of CPU cores and find one that provides the best performance. -In addition to the number of streams, it is also possible to play with the batch size to find the throughput sweet-spot. - The throughput mode relaxes the requirement to saturate the CPU by using a large batch: running multiple independent inference requests in parallel often gives much better performance, than using a batch only. This allows you to simplify the app-logic, as you don't need to combine multiple inputs into a batch to achieve good CPU performance. Instead, it is possible to keep a separate infer request per camera or another source of input and process the requests in parallel using Async API. ## Benchmark App [Benchmark App](../../inference-engine/samples/benchmark_app/README.md) sample is the best performance reference. -It has a lot of device-specific knobs, but the primary usage is as simple as: +It has a lot of device-specific knobs, but the primary usage is as simple as: ```bash $ ./benchmark_app –d GPU –m -i ``` -to measure the performance of the model on the GPU. +to measure the performance of the model on the GPU. Or ```bash $ ./benchmark_app –d CPU –m -i ``` to execute on the CPU instead. -For example, for the CPU throughput mode from the previous section, you can play with number of streams (`-nstreams` command-line param). -Try different values of the `-nstreams` argument from `1` to a number of CPU cores and find one that provides the best performance. For example, on a 8-core CPU, compare the `-nstreams 1` (which is a latency-oriented scenario) to the `2`, `4` and `8` streams. Notice that `benchmark_app` automatically queries/creates/runs number of requests required to saturate the given number of streams. +For example, for the CPU throughput mode from the previous section, you can play with number of streams (`-nstreams` command-line param). +Try different values of the `-nstreams` argument from `1` to a number of CPU cores and find one that provides the best performance. For example, on a 8-core CPU, compare the `-nstreams 1` (which is a latency-oriented scenario) to the `2`, `4` and `8` streams. Notice that `benchmark_app` automatically queries/creates/runs number of requests required to saturate the given number of streams. Finally, notice that when you don't specify number of streams with `-nstreams`, "AUTO" value for the streams is used, e.g. for the CPU this is [CPU_THROUGHPUT_AUTO](supported_plugins/CPU.md). 
You can spot the actual value behind "AUTO" for your machine in the application output. Notice that the "AUTO" number is not necessarily most optimal, so it is generally recommended to play either with the benchmark_app's "-nstreams" as described above, or via [new Workbench tool](@ref workbench_docs_Workbench_DG_Introduction).This allows you to simplify the app-logic, as you don't need to combine multiple inputs into a batch to achieve good CPU performance. Instead, it is possible to keep a separate infer request per camera or another source of input and process the requests in parallel using Async API. - -## Kernels Tuning for GPU - -GPU backend comes with a feature, that allows models tuning, so the workload is configured to fit better into hardware. - -Tuning is time consuming process, which internally execute every layer several (or even hundreds) times to find most performant configuration. - -This configuration is saved into json-formatted file, whose name can be passed as plugin param to network. GPU backend will process this data to configure kernels for the best performance. - -For more details about Kernels Tuning and How-To please refer to [GPU Kernels Tuning](GPU_Kernels_Tuning.md). diff --git a/docs/IE_DG/Migration_CoreAPI.md b/docs/IE_DG/Migration_CoreAPI.md deleted file mode 100644 index d49bd425bc87c6..00000000000000 --- a/docs/IE_DG/Migration_CoreAPI.md +++ /dev/null @@ -1,70 +0,0 @@ -[DEPRECATED] Migration from Inference Engine Plugin API to Core API {#openvino_docs_IE_DG_Migration_CoreAPI} -=============================== - -For 2019 R2 Release, the new Inference Engine Core API is introduced. This guide is updated to reflect the new API approach. The Inference Engine Plugin API is still supported, but is going to be deprecated in future releases. - -This section provides common steps to migrate your application written using the Inference Engine Plugin API (`InferenceEngine::InferencePlugin`) to the Inference Engine Core API (`InferenceEngine::Core`). - -To learn how to write a new application using the Inference Engine, refer to [Integrate the Inference Engine Request API with Your Application](Integrate_with_customer_application_new_API.md) and [Inference Engine Samples Overview](Samples_Overview.md). - -## Inference Engine Core Class - -The Inference Engine Core class is implemented on top existing Inference Engine Plugin API and handles plugins internally. -The main responsibility of the `InferenceEngine::Core` class is to hide plugin specifics inside and provide a new layer of abstraction that works with devices (`InferenceEngine::Core::GetAvailableDevices`). Almost all methods of this class accept `deviceName` as an additional parameter that denotes an actual device you are working with. Plugins are listed in the `plugins.xml` file, which is loaded during constructing `InferenceEngine::Core` objects: - -```bash - - - - - ... - -``` - -## Migration Steps - -Common migration process includes the following steps: - -1. Migrate from the `InferenceEngine::InferencePlugin` initialization: - -@snippet snippets/Migration_CoreAPI.cpp part0 - -to the `InferenceEngine::Core` class initialization: - -@snippet snippets/Migration_CoreAPI.cpp part1 - -2. 
Instead of using `InferenceEngine::CNNNetReader` to read IR: - -@snippet snippets/Migration_CoreAPI.cpp part2 - -read networks using the Core class: - -@snippet snippets/Migration_CoreAPI.cpp part3 - -The Core class also allows reading models from the ONNX format (more information is [here](./ONNX_Support.md)): - -@snippet snippets/Migration_CoreAPI.cpp part4 - -3. Instead of adding CPU device extensions to the plugin: - -@snippet snippets/Migration_CoreAPI.cpp part5 - -add extensions to CPU device using the Core class: - -@snippet snippets/Migration_CoreAPI.cpp part6 - -4. Instead of setting configuration keys to a particular plugin, set (key, value) pairs via `InferenceEngine::Core::SetConfig` - -@snippet snippets/Migration_CoreAPI.cpp part7 - -> **NOTE**: If `deviceName` is omitted as the last argument, configuration is set for all Inference Engine devices. - -5. Migrate from loading the network to a particular plugin: - -@snippet snippets/Migration_CoreAPI.cpp part8 - -to `InferenceEngine::Core::LoadNetwork` to a particular device: - -@snippet snippets/Migration_CoreAPI.cpp part9 - -After you have an instance of `InferenceEngine::ExecutableNetwork`, all other steps are as usual. diff --git a/docs/IE_DG/Model_caching_overview.md b/docs/IE_DG/Model_caching_overview.md new file mode 100644 index 00000000000000..25ae7387c244f3 --- /dev/null +++ b/docs/IE_DG/Model_caching_overview.md @@ -0,0 +1,65 @@ +# Model Caching Overview {#openvino_docs_IE_DG_Model_caching_overview} + +## Introduction + +As described in [Inference Engine Introduction](inference_engine_intro.md), common application flow consists of the following steps: + +1. **Create Inference Engine Core object** + +2. **Read the Intermediate Representation** - Read an Intermediate Representation file into an object of the `InferenceEngine::CNNNetwork` + +3. **Prepare inputs and outputs** + +4. **Set configuration** Pass device-specific loading configurations to the device + +5. **Compile and Load Network to device** - Use the `InferenceEngine::Core::LoadNetwork()` method with specific device + +6. **Set input data** + +7. **Execute** + +Step #5 can potentially perform several time-consuming device-specific optimizations and network compilations, +and such delays can lead to bad user experience on application startup. To avoid this, some devices offer +Import/Export network capability, and it is possible to either use [Compile tool](../../inference-engine/tools/compile_tool/README.md) +or enable model caching to export compiled network automatically. Reusing cached networks can significantly reduce load network time. + + +## Set "CACHE_DIR" config option to enable model caching + +To enable model caching, the application must specify the folder where to store cached blobs. It can be done like this + + +@snippet snippets/InferenceEngine_Caching0.cpp part0 + +With this code, if device supports Import/Export network capability, cached blob is automatically created inside the `myCacheFolder` folder +CACHE_DIR config is set to the Core object. If device does not support Import/Export capability, cache is just not created and no error is thrown + +Depending on your device, total time for loading network on application startup can be significantly reduced. 
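[Editor's illustrative example] The `@snippet snippets/InferenceEngine_Caching0.cpp` file referenced above is not part of this diff. As a rough sketch only (not the exact snippet content), enabling the cache with the 2021.4 Core API could look like the following; `myCacheFolder`, `model.xml`, and the `GPU` device name are placeholder values.

```cpp
#include <inference_engine.hpp>

#include <map>
#include <string>

int main() {
    InferenceEngine::Core ie;

    // Point the cache to a writable folder; devices that support the
    // Import/Export capability will store compiled blobs there.
    ie.SetConfig({{CONFIG_KEY(CACHE_DIR), "myCacheFolder"}});

    // The first LoadNetwork call compiles the network and exports the blob
    // to the cache; later calls with the same model and device import the
    // cached blob instead, which shortens the load time.
    InferenceEngine::ExecutableNetwork execNet =
        ie.LoadNetwork("model.xml", "GPU");

    return 0;
}
```

If the selected device does not support Import/Export, the call still succeeds and simply skips caching, as described above.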
+Please also note that very first LoadNetwork (when cache is not yet created) takes slightly longer time to 'export' compiled blob into a cache file +![caching_enabled] + +## Even faster: use LoadNetwork(modelPath) + +In some cases, applications do not need to customize inputs and outputs every time. Such applications always +call `cnnNet = ie.ReadNetwork(...)`, then `ie.LoadNetwork(cnnNet, ..)` and it can be further optimized. +For such cases, more convenient API to load network in one call is introduced in the 2021.4 release. + +@snippet snippets/InferenceEngine_Caching1.cpp part1 + +With enabled model caching, total load time is even smaller - in case that ReadNetwork is optimized as well + +@snippet snippets/InferenceEngine_Caching2.cpp part2 + +![caching_times] + + +## Advanced examples + +Not every device supports network import/export capability, enabling of caching for such devices do not have any effect. +To check in advance if a particular device supports model caching, your application can use the following code: + +@snippet snippets/InferenceEngine_Caching3.cpp part3 + + +[caching_enabled]: ../img/caching_enabled.png +[caching_times]: ../img/caching_times.png diff --git a/docs/IE_DG/OnnxImporterTutorial.md b/docs/IE_DG/OnnxImporterTutorial.md deleted file mode 100644 index f4538633a7e805..00000000000000 --- a/docs/IE_DG/OnnxImporterTutorial.md +++ /dev/null @@ -1,67 +0,0 @@ -# ONNX* Importer API Tutorial {#openvino_docs_IE_DG_OnnxImporterTutorial} - -> **NOTE**: This tutorial is deprecated. Since OpenVINO™ 2020.4 version, Inference Engine enables reading ONNX models via the Inference Engine Core API -> and there is no need to use directly the low-level ONNX* Importer API anymore. -> To read ONNX\* models, it's recommended to use the `Core::ReadNetwork()` method that provide a uniform way to read models from IR or ONNX format. - -This tutorial demonstrates how to use the ONNX\* Importer API. -This API makes it possible to create an nGraph `Function` object from an imported ONNX model. - -All functions of the ONNX Importer API are in the [onnx.hpp][onnx_header] header file. - -Two categories of API functions: -* Helper functions that check which ONNX ops are supported in a current version of the ONNX Importer -* Functions that read ONNX models from a stream or file and result in an nGraph function, which can be executed using the Inference Engine - -## Check Which ONNX Ops Are Supported - -To list all supported ONNX ops in a specific version and domain, use the `get_supported_operators` -as shown in the example below: - -@snippet snippets/OnnxImporterTutorial0.cpp part0 - -The above code produces a list of all the supported operators for the `version` and `domain` you specified and outputs a list similar to this: -```cpp -Abs -Acos -... -Xor -``` - -To determine whether a specific ONNX operator in a particular version and domain is supported by the importer, use the `is_operator_supported` function as shown in the example below: - -@snippet snippets/OnnxImporterTutorial1.cpp part1 - -## Import ONNX Model - -To import an ONNX model, use the `import_onnx_model` function. -The method has two overloads: -* `import_onnx_model` takes a stream as an input, for example, file stream, memory stream -* `import_onnx_model` takes a file path as an input - -Refer to the sections below for details. 
- -> **NOTE**: The examples below use the ONNX ResNet50 model, which is available at the [ONNX Model Zoo][onnx_model_zoo]: -> ```bash -> $ wget https://s3.amazonaws.com/download.onnx/models/opset_8/resnet50.tar.gz -> $ tar -xzvf resnet50.tar.gz -> ``` - -Once you create the `ng_function`, you can use it to run computation on the Inference Engine. -As it was shown in [Build a Model with nGraph Library](../nGraph_DG/build_function.md), `std::shared_ptr` can be transformed into a `CNNNetwork`. - - -### Stream as Input - -The code below shows how to convert the ONNX ResNet50 model to the nGraph function using `import_onnx_model` with the stream as an input: - -@snippet snippets/OnnxImporterTutorial2.cpp part2 - -### Filepath as Input - -The code below shows how to convert the ONNX ResNet50 model to the nGraph function using `import_onnx_model` with the filepath as an input: - -@snippet snippets/OnnxImporterTutorial3.cpp part3 - -[onnx_header]: https://github.com/NervanaSystems/ngraph/blob/master/src/ngraph/frontend/onnx_import/onnx.hpp -[onnx_model_zoo]: https://github.com/onnx/models diff --git a/docs/IE_DG/img/cpu_int8_flow.png b/docs/IE_DG/img/cpu_int8_flow.png deleted file mode 100644 index 794430126b2877..00000000000000 --- a/docs/IE_DG/img/cpu_int8_flow.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:83bcd7888d3843ddfd9a601288627e98f5874290c00b9988bf1beac9209f2e8d -size 79741 diff --git a/docs/IE_DG/supported_plugins/FPGA.md b/docs/IE_DG/supported_plugins/FPGA.md deleted file mode 100644 index 63ae6e62ed7be0..00000000000000 --- a/docs/IE_DG/supported_plugins/FPGA.md +++ /dev/null @@ -1,22 +0,0 @@ -FPGA Plugin {#openvino_docs_IE_DG_supported_plugins_FPGA} -=========== - -## Product Change Notice -Intel® Distribution of OpenVINO™ toolkit for Intel® Vision Accelerator Design with an Intel® Arria® 10 FPGA and the Intel® Programmable Acceleration Card with Intel® Arria® 10 GX FPGA - - - - - - - - - - -
Change Notice Begins: July 2020
Change Date: October 2020
- -Intel will be transitioning to the next-generation programmable deep-learning solution based on FPGAs in order to increase the level of customization possible in FPGA deep-learning. As part of this transition, future standard releases (i.e., non-LTS releases) of Intel® Distribution of OpenVINO™ toolkit will no longer include the Intel® Vision Accelerator Design with an Intel® Arria® 10 FPGA and the Intel® Programmable Acceleration Card with Intel® Arria® 10 GX FPGA. - -Intel® Distribution of OpenVINO™ toolkit 2020.3.X LTS release will continue to support Intel® Vision Accelerator Design with an Intel® Arria® 10 FPGA and the Intel® Programmable Acceleration Card with Intel® Arria® 10 GX FPGA. For questions about next-generation programmable deep-learning solutions based on FPGAs, please talk to your sales representative or contact us to get the latest FPGA updates. - -For documentation for the FPGA plugin available in previous releases of Intel® Distribution of OpenVINO™ toolkit with FPGA Support, see documentation for the [2020.4 version](https://docs.openvinotoolkit.org/2020.4/openvino_docs_IE_DG_supported_plugins_FPGA.html) and lower. \ No newline at end of file diff --git a/docs/IE_DG/supported_plugins/CL_DNN.md b/docs/IE_DG/supported_plugins/GPU.md similarity index 62% rename from docs/IE_DG/supported_plugins/CL_DNN.md rename to docs/IE_DG/supported_plugins/GPU.md index 0216ae71d0dd36..cc12be98a121e1 100644 --- a/docs/IE_DG/supported_plugins/CL_DNN.md +++ b/docs/IE_DG/supported_plugins/GPU.md @@ -1,4 +1,4 @@ -GPU Plugin {#openvino_docs_IE_DG_supported_plugins_CL_DNN} +GPU Plugin {#openvino_docs_IE_DG_supported_plugins_GPU} ======= The GPU plugin uses the Intel® Compute Library for Deep Neural Networks (clDNN) to infer deep neural networks. @@ -89,13 +89,10 @@ Some layers are executed during the load time, not during the inference. One of The following layers are not accelerated on the GPU and executed on the host CPU instead: * Proposal -* SimplerNMS +* NonMaxSuppression * PriorBox * DetectionOutput -## Known Layers Limitations -* ROIPooling is supported for 'max' value of 'method' attribute. - ## Supported Configuration Parameters The plugin supports the configuration parameters listed below. @@ -107,31 +104,21 @@ When specifying key values as raw strings (that is, when using Python API), omit | `KEY_CACHE_DIR` | `""` | `""` | Specifies a directory where compiled OCL binaries can be cached. First model loading generates the cache, and all subsequent LoadNetwork calls use precompiled kernels which significantly improves load time. If empty - caching is disabled | | `KEY_PERF_COUNT` | `YES` / `NO` | `NO` | Collect performance counters during inference | | `KEY_CONFIG_FILE` | `" [ ...]"` | `""` | Load custom layer configuration files | -| `KEY_DUMP_KERNELS` | `YES` / `NO` | `NO` | Dump the final kernels used for custom layers | -| `KEY_TUNING_MODE` | `TUNING_DISABLED`
`TUNING_CREATE`
`TUNING_USE_EXISTING` | `TUNING_DISABLED` | Disable inference kernel tuning
Create tuning file (expect much longer runtime)
Use an existing tuning file | -| `KEY_TUNING_FILE` | `""` | `""` | Tuning file to create / use | -| `KEY_CLDNN_PLUGIN_PRIORITY` | `<0-3>` | `0` | OpenCL queue priority (before usage, make sure your OpenCL driver supports appropriate extension)
Higher value means higher priority for clDNN OpenCL queue. 0 disables the setting. | -| `KEY_CLDNN_PLUGIN_THROTTLE` | `<0-3>` | `0` | OpenCL queue throttling (before usage, make sure your OpenCL driver supports appropriate extension)
Lower value means lower driver thread priority and longer sleep time for it. 0 disables the setting. | -| `KEY_CLDNN_GRAPH_DUMPS_DIR` | `""` | `""` | clDNN graph optimizer stages dump output directory (in GraphViz format) | -| `KEY_CLDNN_SOURCES_DUMPS_DIR` | `""` | `""` | Final optimized clDNN OpenCL sources dump output directory | -| `KEY_GPU_THROUGHPUT_STREAMS` | `KEY_GPU_THROUGHPUT_AUTO`, or positive integer| 1 | Specifies a number of GPU "execution" streams for the throughput mode (upper bound for a number of inference requests that can be executed simultaneously).
This option is can be used to decrease GPU stall time by providing more effective load from several streams. Increasing the number of streams usually is more effective for smaller topologies or smaller input sizes. Note that your application should provide enough parallel slack (e.g. running many inference requests) to leverage full GPU bandwidth. Additional streams consume several times more GPU memory, so make sure the system has enough memory available to suit parallel stream execution. Multiple streams might also put additional load on CPU. If CPU load increases, it can be regulated by setting an appropriate `KEY_CLDNN_PLUGIN_THROTTLE` option value (see above). If your target system has relatively weak CPU, keep throttling low.
The default value is 1, which implies latency-oriented behavior.
`KEY_GPU_THROUGHPUT_AUTO` creates bare minimum of streams to improve the performance; this is the most portable option if you are not sure how many resources your target machine has (and what would be the optimal number of streams).
A positive integer value creates the requested number of streams. | +| `KEY_GPU_PLUGIN_PRIORITY` | `<0-3>` | `0` | OpenCL queue priority (before usage, make sure your OpenCL driver supports appropriate extension)
Higher value means higher priority for OpenCL queue. 0 disables the setting. | +| `KEY_GPU_PLUGIN_THROTTLE` | `<0-3>` | `0` | OpenCL queue throttling (before usage, make sure your OpenCL driver supports appropriate extension)
Lower value means lower driver thread priority and longer sleep time for it. 0 disables the setting. | +| `KEY_CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS` | `YES` / `NO` | `YES` | Allows using FP16+INT8 mixed precision mode, so non-quantized parts of a model will be executed in FP16 precision for FP16 IR. Does not affect quantized FP32 IRs | +| `KEY_GPU_NV12_TWO_INPUTS` | `YES` / `NO` | `NO` | Controls preprocessing logic for NV12 input. If it is set to YES, the device graph expects the user to set a biplanar NV12 blob as input, which will be passed directly to the device execution graph. Otherwise, preprocessing via GAPI is used to convert NV12->BGR, so the GPU graph has to expect a single input | +| `KEY_GPU_THROUGHPUT_STREAMS` | `KEY_GPU_THROUGHPUT_AUTO`, or positive integer| 1 | Specifies a number of GPU "execution" streams for the throughput mode (upper bound for a number of inference requests that can be executed simultaneously).
This option can be used to decrease GPU stall time by providing a more effective load from several streams. Increasing the number of streams is usually more effective for smaller topologies or smaller input sizes. Note that your application should provide enough parallel slack (e.g. running many inference requests) to leverage full GPU bandwidth. Additional streams consume several times more GPU memory, so make sure the system has enough memory available to suit parallel stream execution. Multiple streams might also put additional load on the CPU. If CPU load increases, it can be regulated by setting an appropriate `KEY_GPU_PLUGIN_THROTTLE` option value (see above). If your target system has a relatively weak CPU, keep throttling low.
The default value is 1, which implies latency-oriented behavior.
`KEY_GPU_THROUGHPUT_AUTO` creates the bare minimum of streams needed to improve performance; this is the most portable option if you are not sure how many resources your target machine has (and what the optimal number of streams would be).
A positive integer value creates the requested number of streams. | | `KEY_EXCLUSIVE_ASYNC_REQUESTS` | `YES` / `NO` | `NO` | Forces async requests (also from different executable networks) to execute serially.| -| `KEY_CLDNN_MAX_NUM_THREADS` | `integer value` | `maximum # of HW threads available in host environment` | Specifies the number of CPU threads that can be used for clDNN engine, e.g, JIT compilation of clDNN kernels or clDNN cpu kernel processing. The default value is set as the number of maximum available threads in host environment to minimize the time for LoadNetwork, where the clDNN kernel build time occupies a large portion. Note that if the specified value is larger than the maximum available # of threads or less than zero, it is set as maximum available # of threads. It can be specified with a smaller number than the available HW threads according to the usage scenario, e.g., when the user wants to assign more CPU threads while clDNN plugin is running. Note that setting this value with lower number will affect not only the network loading time but also the cpu layers of clDNN networks that are optimized with multi-threading. | -| `KEY_CLDNN_ENABLE_LOOP_UNROLLING` | `YES` / `NO` | `YES` | Enables recurrent layers such as TensorIterator or Loop with fixed iteration count to be unrolled. It is turned on by default. Turning this key on will achieve better inference performance for loops with not too many iteration counts (less than 16, as a rule of thumb). Turning this key off will achieve better performance for both graph loading time and inference time with many iteration counts (greater than 16). Note that turning this key on will increase the graph loading time in proportion to the iteration counts. Thus, this key should be turned off if graph loading time is considered to be most important target to optimize. | - -## Note on Debug Capabilities of the GPU Plugin - -Inference Engine GPU plugin provides possibility to dump the user custom OpenCL™ kernels to a file to allow you to properly debug compilation issues in your custom kernels. - -The application can use the SetConfig() function with the key PluginConfigParams::KEY_DUMP_KERNELS and value: PluginConfigParams::YES. Then during network loading, all custom layers will print their OpenCL kernels with the JIT instrumentation added by the plugin. -The kernels will be stored in the working directory under files named the following way: clDNN_program0.cl, clDNN_program1.cl. - -This option is disabled by default. Additionally, the application can call the SetConfig() function with the key PluginConfigParams::KEY_DUMP_KERNELS and value: PluginConfigParams::NO before network loading. - -How to verify that this option is disabled: -1. Delete all clDNN_program*.cl files from the current directory -2. Run your application to load a network -3. Examine the working directory for the presence of any kernel file (for example, clDNN_program0.cl) +| `KEY_GPU_MAX_NUM_THREADS` | `integer value` | `maximum # of HW threads available in host environment` | Specifies the number of CPU threads that can be used for GPU engine, e.g, JIT compilation of GPU kernels or cpu kernel processing within GPU plugin. The default value is set as the number of maximum available threads in host environment to minimize the time for LoadNetwork, where the GPU kernel build time occupies a large portion. Note that if the specified value is larger than the maximum available # of threads or less than zero, it is set as maximum available # of threads. 
It can be specified with a smaller number than the available HW threads according to the usage scenario, e.g., when the user wants to assign more CPU threads while the GPU plugin is running. Note that setting this value to a lower number will affect not only the network loading time but also the CPU layers of GPU networks that are optimized with multi-threading. | +| `KEY_GPU_ENABLE_LOOP_UNROLLING` | `YES` / `NO` | `YES` | Enables recurrent layers such as TensorIterator or Loop with a fixed iteration count to be unrolled. It is turned on by default. Turning this key on will achieve better inference performance for loops with not too many iterations (less than 16, as a rule of thumb). Turning this key off will achieve better performance for both graph loading time and inference time for loops with many iterations (greater than 16). Note that turning this key on will increase the graph loading time in proportion to the iteration count. Thus, this key should be turned off if graph loading time is considered the most important target to optimize. | +| `KEY_CLDNN_PLUGIN_PRIORITY` | `<0-3>` | `0` | OpenCL queue priority (before usage, make sure your OpenCL driver supports appropriate extension)
Higher value means higher priority for OpenCL queue. 0 disables the setting. **Deprecated**. Please use `KEY_GPU_PLUGIN_PRIORITY` | +| `KEY_CLDNN_PLUGIN_THROTTLE` | `<0-3>` | `0` | OpenCL queue throttling (before usage, make sure your OpenCL driver supports appropriate extension)
Lower value means lower driver thread priority and longer sleep time for it. 0 disables the setting. **Deprecated**. Please use `KEY_GPU_PLUGIN_THROTTLE` | +| `KEY_CLDNN_GRAPH_DUMPS_DIR` | `""` | `""` | clDNN graph optimizer stages dump output directory (in GraphViz format). **Deprecated**. Will be removed in the next release | +| `KEY_CLDNN_SOURCES_DUMPS_DIR` | `""` | `""` | Final optimized clDNN OpenCL sources dump output directory. **Deprecated**. Will be removed in the next release | +| `KEY_DUMP_KERNELS` | `YES` / `NO` | `NO` | Dump the final kernels used for custom layers. **Deprecated**. Will be removed in the next release | +| `KEY_TUNING_MODE` | `TUNING_DISABLED`
`TUNING_CREATE`
`TUNING_USE_EXISTING` | `TUNING_DISABLED` | Disable inference kernel tuning
Create tuning file (expect much longer runtime)
Use an existing tuning file. **Deprecated**. Will be removed in the next release | +| `KEY_TUNING_FILE` | `""` | `""` | Tuning file to create / use. **Deprecated**. Will be removed in the next release | ## GPU Context and Video Memory Sharing RemoteBlob API diff --git a/docs/IE_DG/supported_plugins/Supported_Devices.md b/docs/IE_DG/supported_plugins/Supported_Devices.md index ed8cabec076f03..e1140ae4b74cae 100644 --- a/docs/IE_DG/supported_plugins/Supported_Devices.md +++ b/docs/IE_DG/supported_plugins/Supported_Devices.md @@ -9,11 +9,11 @@ The Inference Engine provides unique capabilities to infer deep learning models | Plugin | Device types | |------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------| -|[GPU plugin](CL_DNN.md) |Intel® Processor Graphics, including Intel® HD Graphics and Intel® Iris® Graphics | +|[GPU plugin](GPU.md) |Intel® Processor Graphics, including Intel® HD Graphics and Intel® Iris® Graphics | |[CPU plugin](CPU.md) |Intel® Xeon® with Intel® Advanced Vector Extensions 2 (Intel® AVX2), Intel® Advanced Vector Extensions 512 (Intel® AVX-512), and AVX512_BF16, Intel® Core™ Processors with Intel® AVX2, Intel® Atom® Processors with Intel® Streaming SIMD Extensions (Intel® SSE) | |[VPU plugins](VPU.md) (available in the Intel® Distribution of OpenVINO™ toolkit) |Intel® Neural Compute Stick 2 powered by the Intel® Movidius™ Myriad™ X, Intel® Vision Accelerator Design with Intel® Movidius™ VPUs | |[GNA plugin](GNA.md) (available in the Intel® Distribution of OpenVINO™ toolkit) |Intel® Speech Enabling Developer Kit, Amazon Alexa* Premium Far-Field Developer Kit, Intel® Pentium® Silver J5005 Processor, Intel® Pentium® Silver N5000 Processor, Intel® Celeron® J4005 Processor, Intel® Celeron® J4105 Processor, Intel® Celeron® Processor N4100, Intel® Celeron® Processor N4000, Intel® Core™ i3-8121U Processor, Intel® Core™ i7-1065G7 Processor, Intel® Core™ i7-1060G7 Processor, Intel® Core™ i5-1035G4 Processor, Intel® Core™ i5-1035G7 Processor, Intel® Core™ i5-1035G1 Processor, Intel® Core™ i5-1030G7 Processor, Intel® Core™ i5-1030G4 Processor, Intel® Core™ i3-1005G1 Processor, Intel® Core™ i3-1000G1 Processor, Intel® Core™ i3-1000G4 Processor| -|[Multi-Device plugin](MULTI.md) |Multi-Device plugin enables simultaneous inference of the same network on several Intel® devices in parallel | +|[Multi-Device plugin](MULTI.md) |Multi-Device plugin enables simultaneous inference of the same network on several Intel® devices in parallel | |[Heterogeneous plugin](HETERO.md) |Heterogeneous plugin enables automatic inference splitting between several Intel® devices (for example if a device doesn't [support certain layers](#supported-layers)). | Devices similar to the ones we have used for benchmarking can be accessed using [Intel® DevCloud for the Edge](https://devcloud.intel.com/edge/), a remote development environment with access to Intel® hardware and the latest versions of the Intel® Distribution of the OpenVINO™ Toolkit. [Learn more](https://devcloud.intel.com/edge/get_started/devcloud/) or [Register here](https://inteliot.force.com/DevcloudForEdge/s/). @@ -60,7 +60,7 @@ For example, the CHW value at index (c,h,w) is physically located at index (c\*H |GNA plugin |Supported |Supported |Not supported |
\* - currently, only limited set of topologies might benefit from enabling I8 model on GPU
For [Multi-Device](MULTI.md) and [Heterogeneous](HETERO.md) execution -the supported models formats depends on the actual underlying devices. _Generally, FP16 is preferable as it is most ubiquitous and performant_. +the supported models formats depends on the actual underlying devices. _Generally, FP16 is preferable as it is most ubiquitous and performant_. ### Supported Input Precision @@ -73,7 +73,7 @@ the supported models formats depends on the actual underlying devices. _Generall
\* - Supported via `SetBlob` only, `GetBlob` returns FP32
For [Multi-Device](MULTI.md) and [Heterogeneous](HETERO.md) execution -the supported input precision depends on the actual underlying devices. _Generally, U8 is preferable as it is most ubiquitous_. +the supported input precision depends on the actual underlying devices. _Generally, U8 is preferable as it is most ubiquitous_. ### Supported Output Precision @@ -84,7 +84,7 @@ the supported input precision depends on the actual underlying devices. _Genera |VPU plugins |Supported |Supported | |GNA plugin |Supported |Not supported | For [Multi-Device](MULTI.md) and [Heterogeneous](HETERO.md) execution -the supported output precision depends on the actual underlying devices. _Generally, FP32 is preferable as it is most ubiquitous_. +the supported output precision depends on the actual underlying devices. _Generally, FP32 is preferable as it is most ubiquitous_. ### Supported Input Layout diff --git a/docs/IE_PLUGIN_DG/ExecutableNetwork.md b/docs/IE_PLUGIN_DG/ExecutableNetwork.md index c5bfd889857793..ae82b05e4edce3 100644 --- a/docs/IE_PLUGIN_DG/ExecutableNetwork.md +++ b/docs/IE_PLUGIN_DG/ExecutableNetwork.md @@ -49,20 +49,15 @@ The function accepts a const shared pointer to `ngraph::Function` object and per This constructor creates a backend specific graph by importing from a stream object: -> **NOTE**: The export of backend specific graph is done in the `ExportImpl` method, and data formats must be the same for both import and export. +> **NOTE**: The export of backend specific graph is done in the `Export` method, and data formats must be the same for both import and export. @snippet src/template_executable_network.cpp executable_network:ctor_import_stream -### `ExportImpl()` - -**Implementation details:** -Base InferenceEngine::ExecutableNetworkThreadSafeDefault class implements the public InferenceEngine::ExecutableNetworkThreadSafeDefault::Export method as following: -- Writes `_plugin->GetName()` to the `model` stream. -- Calls the `ExportImpl` method defined in a derived class to dump a backend specific graph. +### `Export()` The implementation of the method should write all data to the `model` stream, which is required to import a backend specific graph later in the `Plugin::Import` method: -@snippet src/template_executable_network.cpp executable_network:export_impl +@snippet src/template_executable_network.cpp executable_network:export ### `CreateInferRequest()` diff --git a/docs/IE_PLUGIN_DG/Plugin.md b/docs/IE_PLUGIN_DG/Plugin.md index cadc8660fd3c30..6003eb691fc059 100644 --- a/docs/IE_PLUGIN_DG/Plugin.md +++ b/docs/IE_PLUGIN_DG/Plugin.md @@ -159,21 +159,13 @@ The snippet below provides an example of the implementation for `GetMetric`: > **NOTE**: If an unsupported metric key is passed to the function, it must throw an exception. -### `ImportNetworkImpl()` +### `ImportNetwork()` The importing network mechanism allows to import a previously exported backend specific graph and wrap it using an [ExecutableNetwork](@ref executable_network) object. This functionality is useful if backend specific graph compilation takes significant time and/or cannot be done on a target host device due to other reasons. -**Implementation details:** The base plugin class InferenceEngine::IInferencePlugin implements InferenceEngine::IInferencePlugin::ImportNetwork -as follows: exports a device type (InferenceEngine::IInferencePlugin::_pluginName) and then calls `ImportNetworkImpl`, -which is implemented in a derived class. 
-If a plugin cannot use the base implementation InferenceEngine::IInferencePlugin::ImportNetwork, it can override base -implementation and define an output blob structure up to its needs. This -can be useful if a plugin exports a blob in a special format for integration with other frameworks -where a common Inference Engine header from a base class implementation is not appropriate. - During export of backend specific graph using `ExecutableNetwork::Export`, a plugin may export any type of information it needs to import a compiled graph properly and check its correctness. For example, the export information may include: diff --git a/docs/IE_PLUGIN_DG/PluginTesting.md b/docs/IE_PLUGIN_DG/PluginTesting.md index 0aae6280601ca1..e2aabeff32e497 100644 --- a/docs/IE_PLUGIN_DG/PluginTesting.md +++ b/docs/IE_PLUGIN_DG/PluginTesting.md @@ -21,7 +21,7 @@ Engine concepts: plugin creation, multiple executable networks support, multiple @snippet single_layer_tests/convolution.cpp test_convolution:declare_parameters - - Instantiate the test itself using standard GoogleTest macro `INSTANTIATE_TEST_CASE_P`: + - Instantiate the test itself using standard GoogleTest macro `INSTANTIATE_TEST_SUITE_P`: @snippet single_layer_tests/convolution.cpp test_convolution:instantiate diff --git a/docs/MO_DG/img/DeepSpeech-0.8.2.png b/docs/MO_DG/img/DeepSpeech-0.8.2.png new file mode 100644 index 00000000000000..ddab04ac34ac29 --- /dev/null +++ b/docs/MO_DG/img/DeepSpeech-0.8.2.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdff3768930f683b81ca466be4f947af3172933a702cd38201a254df27a68556 +size 62498 diff --git a/docs/MO_DG/img/DeepSpeech.png b/docs/MO_DG/img/DeepSpeech.png deleted file mode 100644 index b6f1ca96486850..00000000000000 --- a/docs/MO_DG/img/DeepSpeech.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7ed2c9052f631055090ef3744117ca5a8e8314e0717ba0fdc984e295caa5b925 -size 112455 diff --git a/docs/MO_DG/prepare_model/Model_Optimizer_FAQ.md b/docs/MO_DG/prepare_model/Model_Optimizer_FAQ.md index f9aef04a0a9561..bb599cf93b5632 100644 --- a/docs/MO_DG/prepare_model/Model_Optimizer_FAQ.md +++ b/docs/MO_DG/prepare_model/Model_Optimizer_FAQ.md @@ -627,4 +627,16 @@ It means that you trying to convert the topology which contains '_contrib_box_nm }); -\endhtmlonly \ No newline at end of file +\endhtmlonly + +#### 103. What does the message "ModelOptimizer is not able to parse *.caffemodel" mean? + +If a '*.caffemodel' file exists and it is correct, the error possibly occured due to the use of Python protobuf implementation. In some cases, it shows error message during model parsing, for example: "'utf-8' codec can't decode byte 0xe0 in position 4: invalid continuation byte in field: mo_caffe.SpatialTransformerParameter.transform_type". You can either use Python 3.6/3.7 or build 'cpp' implementation of protobuf yourself for your version of Python. For the complete instructions about building `protobuf` from sources, see the appropriate section in [Converting a Model to Intermediate Representation](Config_Model_Optimizer.md). + +#### 104. What does the message "SyntaxError: 'yield' inside list comprehension" during MxNet\* model conversion mean? + +The issue "SyntaxError: 'yield' inside list comprehension" might occur during converting MXNet\* models (mobilefacedet-v1-mxnet, brain-tumor-segmentation-0001) on Windows* platform with Python* 3.8 environment. This issue is caused by API changes for `yield expression` in Python 3.8. 
+The following workarounds are suggested to resolve this issue: +1. Use Python 3.6/3.7 to convert MXNet\* models on Windows +2. Update MXNet: pip install mxnet=1.7.0.post2 +Note that you might have conflicts between previously installed PyPI dependencies. \ No newline at end of file diff --git a/docs/MO_DG/prepare_model/convert_model/Convert_Model_From_PyTorch.md b/docs/MO_DG/prepare_model/convert_model/Convert_Model_From_PyTorch.md index a03df559291a06..0898fd7e2225f7 100644 --- a/docs/MO_DG/prepare_model/convert_model/Convert_Model_From_PyTorch.md +++ b/docs/MO_DG/prepare_model/convert_model/Convert_Model_From_PyTorch.md @@ -25,6 +25,8 @@ It is not a full list of models that can be converted to ONNX\* and to IR. * F3Net topology can be converted using [Convert PyTorch\* F3Net to the IR](pytorch_specific/Convert_F3Net.md) instruction. * QuartzNet topologies from [NeMo project](https://github.com/NVIDIA/NeMo) can be converted using [Convert PyTorch\* QuartzNet to the IR](pytorch_specific/Convert_QuartzNet.md) instruction. * YOLACT topology can be converted using [Convert PyTorch\* YOLACT to the IR](pytorch_specific/Convert_YOLACT.md) instruction. +* [RCAN](https://github.com/yulunzhang/RCAN) topologies can be converted using [Convert PyTorch\* RCAN to the IR](pytorch_specific/Convert_RCAN.md) instruction. +* [BERT_NER](https://github.com/kamalkraj/BERT-NER) can be converted using [Convert PyTorch* BERT-NER to the IR](pytorch_specific/Convert_Bert_ner.md) instruction. ## Export PyTorch\* Model to ONNX\* Format diff --git a/docs/MO_DG/prepare_model/convert_model/Convert_Model_From_TensorFlow.md b/docs/MO_DG/prepare_model/convert_model/Convert_Model_From_TensorFlow.md index c4721cdead07ee..7e29a7668b2f24 100644 --- a/docs/MO_DG/prepare_model/convert_model/Convert_Model_From_TensorFlow.md +++ b/docs/MO_DG/prepare_model/convert_model/Convert_Model_From_TensorFlow.md @@ -161,7 +161,7 @@ Where `HEIGHT` and `WIDTH` are the input images height and width for which the m * [GNMT](https://github.com/tensorflow/nmt) topology can be converted using [these instructions](tf_specific/Convert_GNMT_From_Tensorflow.md). * [BERT](https://github.com/google-research/bert) topology can be converted using [these instructions](tf_specific/Convert_BERT_From_Tensorflow.md). * [XLNet](https://github.com/zihangdai/xlnet) topology can be converted using [these instructions](tf_specific/Convert_XLNet_From_Tensorflow.md). - +* [Attention OCR](https://github.com/emedvedev/attention-ocr) topology can be converted using [these instructions](tf_specific/Convert_AttentionOCR_From_Tensorflow.md). ## Loading Non-Frozen Models to the Model Optimizer diff --git a/docs/MO_DG/prepare_model/convert_model/pytorch_specific/Convert_Bert_ner.md b/docs/MO_DG/prepare_model/convert_model/pytorch_specific/Convert_Bert_ner.md new file mode 100644 index 00000000000000..cc4e78dcb077a8 --- /dev/null +++ b/docs/MO_DG/prepare_model/convert_model/pytorch_specific/Convert_Bert_ner.md @@ -0,0 +1,55 @@ +# Convert PyTorch* BERT-NER to the Intermediate Representation {#openvino_docs_MO_DG_prepare_model_convert_model_pytorch_specific_Convert_Bert_ner} + +## Download and Convert the Model to ONNX* + +To download a pre-trained model or train the model yourself, refer +to the [instruction](https://github.com/kamalkraj/BERT-NER/blob/dev/README.md) in the +BERT-NER model repository. The model with config files is stored in the `out_base` directory. 
+ +To convert the model to ONNX* format, create and run the script with the following content in the root +directory of the model repository. If you download the pre-trained model, you need +to download [`bert.py`](https://github.com/kamalkraj/BERT-NER/blob/dev/bert.py) to run the script. +The instruction was tested with the repository hash commit `e5be564156f194f1becb0d82aeaf6e762d9eb9ed`. + +```python +import torch + +from bert import Ner + +ner = Ner("out_base") + +input_ids, input_mask, segment_ids, valid_positions = ner.preprocess('Steve went to Paris') +input_ids = torch.tensor([input_ids], dtype=torch.long, device=ner.device) +input_mask = torch.tensor([input_mask], dtype=torch.long, device=ner.device) +segment_ids = torch.tensor([segment_ids], dtype=torch.long, device=ner.device) +valid_ids = torch.tensor([valid_positions], dtype=torch.long, device=ner.device) + +ner_model, tknizr, model_config = ner.load_model("out_base") + +with torch.no_grad(): + logits = ner_model(input_ids, segment_ids, input_mask, valid_ids) +torch.onnx.export(ner_model, + (input_ids, segment_ids, input_mask, valid_ids), + "bert-ner.onnx", + input_names=['input_ids', 'segment_ids', 'input_mask', 'valid_ids'], + output_names=['output'], + dynamic_axes={ + "input_ids": {0: "batch_size"}, + "segment_ids": {0: "batch_size"}, + "input_mask": {0: "batch_size"}, + "valid_ids": {0: "batch_size"}, + "output": {0: "output"} + }, + opset_version=11, + ) +``` + +The script generates ONNX* model file `bert-ner.onnx`. + +## Convert ONNX* BERT-NER model to IR + +```bash +python mo.py --input_model bert-ner.onnx --input "input_mask[1 128],segment_ids[1 128],input_ids[1 128]" +``` + +where `1` is `batch_size` and `128` is `sequence_length`. \ No newline at end of file diff --git a/docs/MO_DG/prepare_model/convert_model/pytorch_specific/Convert_RCAN.md b/docs/MO_DG/prepare_model/convert_model/pytorch_specific/Convert_RCAN.md new file mode 100644 index 00000000000000..8fdefe128a6ecb --- /dev/null +++ b/docs/MO_DG/prepare_model/convert_model/pytorch_specific/Convert_RCAN.md @@ -0,0 +1,31 @@ +# Convert PyTorch* RCAN to the Intermediate Representation {#openvino_docs_MO_DG_prepare_model_convert_model_pytorch_specific_Convert_RCAN} + +[RCAN](https://github.com/yulunzhang/RCAN): Image Super-Resolution Using Very Deep Residual Channel Attention Networks + +## Download and Convert the Model to ONNX* + +To download the pre-trained model or train the model yourself, refer to the +[instruction](https://github.com/yulunzhang/RCAN/blob/master/README.md) in the RCAN model repository. Firstly, +convert the model to ONNX\* format. Create and run the script with the following content in the root +directory of the model repository: +```python +from argparse import Namespace + +import torch + +from RCAN_TestCode.code.model.rcan import RCAN + +config = Namespace(n_feats=64, n_resblocks=4, n_resgroups=2, reduction=16, scale=[2], data_train='DIV2K', res_scale=1, + n_colors=3, rgb_range=255) +net = RCAN(config) +net.eval() +dummy_input = torch.randn(1, 3, 360, 640) +torch.onnx.export(net, dummy_input, 'RCAN.onnx') +``` +The script generates the ONNX\* model file RCAN.onnx. You can find more information about model parameters (`n_resblocks`, `n_resgroups`, and others) in the model repository and use different values of them. The model conversion was tested with the repository hash commit `3339ebc59519c3bb2b5719b87dd36515ec7f3ba7`. 
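+
+Optionally, you can sanity-check the exported file before running the Model Optimizer. The snippet below is a minimal sketch, assuming the `onnx` Python package is installed and the file name matches the export script above:
+```python
+# Optional sanity check of the exported model (not part of the original instructions).
+import onnx
+
+model = onnx.load("RCAN.onnx")
+onnx.checker.check_model(model)  # raises an exception if the model is structurally invalid
+# Print the declared input to confirm that the 1x3x360x640 dummy shape was recorded.
+print(model.graph.input[0])
+```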
+ +## Convert ONNX* RCAN Model to IR + +```sh +./mo.py --input_model RCAN.onnx +``` diff --git a/docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_AttentionOCR_From_Tensorflow.md b/docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_AttentionOCR_From_Tensorflow.md new file mode 100644 index 00000000000000..90e94677dd7f33 --- /dev/null +++ b/docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_AttentionOCR_From_Tensorflow.md @@ -0,0 +1,35 @@ +# Convert TensorFlow* Attention OCR Model to Intermediate Representation {#openvino_docs_MO_DG_prepare_model_convert_model_tf_specific_Convert_AttentionOCR_From_Tensorflow} + +This tutorial explains how to convert the Attention OCR (AOCR) model from the [TensorFlow* Attention OCR repository](https://github.com/emedvedev/attention-ocr) to the Intermediate Representation (IR). + +## Extract Model from `aocr` Library + +The easiest way to get an AOCR model is to download `aocr` Python\* library: +``` +pip install git+https://github.com/emedvedev/attention-ocr.git@master#egg=aocr +``` +This library contains a pretrained model and allows to train and run AOCR using the command line. After installing `aocr`, you can extract the model: +``` +aocr export --format=frozengraph model/path/ +``` +After this step you can find the model in model/path/ folder. + +## Convert the TensorFlow* AOCR Model to IR + +The original AOCR model contains data preprocessing which consists of the following steps: +* Decoding input data to binary format where input data is an image represented as a string. +* Resizing binary image to working resolution. + +After that, the resized image is sent to the convolution neural network (CNN). The Model Optimizer does not support image decoding so you should cut of preprocessing part of the model using '--input' command line parameter. +```sh +python3 path/to/model_optimizer/mo_tf.py \ +--input_model=model/path/frozen_graph.pb \ +--input="map/TensorArrayStack/TensorArrayGatherV3:0[1 32 86 1]" \ +--output "transpose_1,transpose_2" \ +--output_dir path/to/ir/ +``` + +Where: +* `map/TensorArrayStack/TensorArrayGatherV3:0[1 32 86 1]` - name of node producing tensor after preprocessing. +* `transpose_1` - name of the node producing tensor with predicted characters. +* `transpose_2` - name of the node producing tensor with predicted characters probabilties \ No newline at end of file diff --git a/docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_DeepSpeech_From_Tensorflow.md b/docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_DeepSpeech_From_Tensorflow.md index 74833cf3ad3332..29df0e4695d330 100644 --- a/docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_DeepSpeech_From_Tensorflow.md +++ b/docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_DeepSpeech_From_Tensorflow.md @@ -2,66 +2,81 @@ [DeepSpeech project](https://github.com/mozilla/DeepSpeech) provides an engine to train speech-to-text models. -## Download the Pre-Trained DeepSpeech Model +## Download the Pretrained DeepSpeech Model -[Pre-trained English speech-to-text model](https://github.com/mozilla/DeepSpeech#getting-the-pre-trained-model) -is publicly available. To download the model, please follow the instruction below: +Create a directory where model and metagraph with pretrained weights will be stored: +``` +mkdir deepspeech +cd deepspeech +``` +[Pretrained English speech-to-text model](https://github.com/mozilla/DeepSpeech/releases/tag/v0.8.2) is publicly available. 
+To download the model, follow the instruction below: * For UNIX*-like systems, run the following command: ``` -wget -O - https://github.com/mozilla/DeepSpeech/releases/download/v0.3.0/deepspeech-0.3.0-models.tar.gz | tar xvfz - +wget -O - https://github.com/mozilla/DeepSpeech/archive/v0.8.2.tar.gz | tar xvfz - +wget -O - https://github.com/mozilla/DeepSpeech/releases/download/v0.8.2/deepspeech-0.8.2-checkpoint.tar.gz | tar xvfz - ``` * For Windows* systems: - 1. Download the archive from the DeepSpeech project repository: [https://github.com/mozilla/DeepSpeech/releases/download/v0.3.0/deepspeech-0.3.0-models.tar.gz](https://github.com/mozilla/DeepSpeech/releases/download/v0.3.0/deepspeech-0.3.0-models.tar.gz). - 2. Unpack it with a file archiver application. + 1. Download the archive with the model: [https://github.com/mozilla/DeepSpeech/archive/v0.8.2.tar.gz](https://github.com/mozilla/DeepSpeech/archive/v0.8.2.tar.gz). + 2. Download the TensorFlow\* MetaGraph with pretrained weights: [https://github.com/mozilla/DeepSpeech/releases/download/v0.8.2/deepspeech-0.8.2-checkpoint.tar.gz](https://github.com/mozilla/DeepSpeech/releases/download/v0.8.2/deepspeech-0.8.2-checkpoint.tar.gz). + 3. Unpack it with a file archiver application. + +## Freeze the Model into a *.pb File -After you unpack the archive with the pre-trained model, you will have the new `models` directory with the -following files: +After unpacking the archives above, you have to freeze the model. Note that this requires +TensorFlow* version 1 which is not available under Python 3.8, so you need Python 3.7 or lower. +Before freezing, deploy a virtual environment and install the required packages: ``` -alphabet.txt -lm.binary -output_graph.pb -output_graph.pbmm -output_graph.rounded.pb -output_graph.rounded.pbmm -trie +virtualenv --python=python3.7 venv-deep-speech +source venv-deep-speech/bin/activate +cd DeepSpeech-0.8.2 +pip3 install -e . ``` +Freeze the model with the following command: +``` +python3 DeepSpeech.py --checkpoint_dir ../deepspeech-0.8.2-checkpoint --export_dir ../ +``` +After that, you will get the pretrained frozen model file `output_graph.pb` in the directory `deepspeech` created at +the beginning. The model contains the preprocessing and main parts. The first preprocessing part performs conversion of input +spectrogram into a form useful for speech recognition (mel). This part of the model is not convertible into +IR because it contains unsupported operations `AudioSpectrogram` and `Mfcc`. -Pre-trained frozen model file is `output_graph.pb`. - -![DeepSpeech model view](../../../img/DeepSpeech.png) +The main and most computationally expensive part of the model converts the preprocessed audio into text. +There are two specificities with the supported part of the model. -As you can see, the frozen model still has two variables: `previous_state_c` and -`previous_state_h`. It means that the model keeps training those variables at each inference. +The first is that the model contains an input with sequence length. So the model can be converted with +a fixed input length shape, thus the model is not reshapeable. +Refer to the [Using Shape Inference](../../../../IE_DG/ShapeInference.md). -At the first inference of this graph, the variables are initialized by zero tensors. After executing the `lstm_fused_cell` nodes, cell state and hidden state, which are the results of the `BlockLSTM` execution, are assigned to these two variables. 
+The second is that the frozen model still has two variables: `previous_state_c` and `previous_state_h`, figure +with the frozen *.pb model is below. It means that the model keeps training these variables at each inference. -With each inference of the DeepSpeech graph, initial cell state and hidden state data for `BlockLSTM` is taken from previous inference from variables. Outputs (cell state and hidden state) of `BlockLSTM` are reassigned to the same variables. +![DeepSpeech model view](../../../img/DeepSpeech-0.8.2.png) -It helps the model to remember the context of the words that it takes as input. +At the first inference the variables are initialized with zero tensors. After executing, the results of the `BlockLSTM` +are assigned to cell state and hidden state, which are these two variables. -## Convert the TensorFlow* DeepSpeech Model to IR +## Convert the Main Part of DeepSpeech Model into IR -The Model Optimizer assumes that the output model is for inference only. That is why you should cut those variables off and resolve keeping cell and hidden states on the application level. +Model Optimizer assumes that the output model is for inference only. That is why you should cut `previous_state_c` +and `previous_state_h` variables off and resolve keeping cell and hidden states on the application level. There are certain limitations for the model conversion: - Time length (`time_len`) and sequence length (`seq_len`) are equal. - Original model cannot be reshaped, so you should keep original shapes. -To generate the DeepSpeech Intermediate Representation (IR), provide the TensorFlow DeepSpeech model to the Model Optimizer with the following parameters: +To generate the IR, run the Model Optimizer with the following parameters: ```sh -python3 ./mo_tf.py \ ---input_model path_to_model/output_graph.pb \ ---freeze_placeholder_with_value input_lengths->[16] \ ---input input_node,previous_state_h/read,previous_state_c/read \ ---input_shape [1,16,19,26],[1,2048],[1,2048] \ ---output raw_logits,lstm_fused_cell/GatherNd,lstm_fused_cell/GatherNd_1 \ +python3 {path_to_mo}/mo_tf.py \ +--input_model output_graph.pb \ +--input "input_lengths->[16],input_node[1 16 19 26],previous_state_h[1 2048],previous_state_c[1 2048]" \ +--output "cudnn_lstm/rnn/multi_rnn_cell/cell_0/cudnn_compatible_lstm_cell/GatherNd_1,cudnn_lstm/rnn/multi_rnn_cell/cell_0/cudnn_compatible_lstm_cell/GatherNd,logits" \ --disable_nhwc_to_nchw ``` Where: -* `--freeze_placeholder_with_value input_lengths->[16]` freezes sequence length -* `--input input_node,previous_state_h/read,previous_state_c/read` and -`--input_shape [1,16,19,26],[1,2048],[1,2048]` replace the variables with a placeholder -* `--output raw_logits,lstm_fused_cell/GatherNd,lstm_fused_cell/GatherNd_1` gets data for the next model -execution. +* `input_lengths->[16]` Replaces the input node with name "input_lengths" with a constant tensor of shape [1] with a + single integer value 16. This means that the model now can consume input sequences of length 16 only. +* `input_node[1 16 19 26],previous_state_h[1 2048],previous_state_c[1 2048]` replaces the variables with a placeholder. +* `--output ".../GatherNd_1,.../GatherNd,logits" ` output node names. 
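+
+Because the state variables are cut off, keeping the cell and hidden states between inferences becomes the application's responsibility. The snippet below is a minimal, hypothetical sketch of such a loop using the Inference Engine Python API. The input and output names and shapes follow the conversion command above, the feature chunks and IR file names are placeholders, and the mapping of the two `GatherNd` outputs to the hidden and cell states is an assumption to verify against your converted model.
+```python
+# Hypothetical sketch: carrying previous_state_c / previous_state_h on the application level.
+import numpy as np
+from openvino.inference_engine import IECore
+
+ie = IECore()
+net = ie.read_network(model="output_graph.xml", weights="output_graph.bin")
+exec_net = ie.load_network(network=net, device_name="CPU")
+
+state_h = np.zeros((1, 2048), dtype=np.float32)  # initial hidden state
+state_c = np.zeros((1, 2048), dtype=np.float32)  # initial cell state
+
+# Placeholder for real MFCC feature chunks, each of shape [1, 16, 19, 26].
+feature_chunks = [np.zeros((1, 16, 19, 26), dtype=np.float32)]
+
+for chunk in feature_chunks:
+    results = exec_net.infer(inputs={
+        "input_node": chunk,
+        "previous_state_h": state_h,
+        "previous_state_c": state_c,
+    })
+    logits = results["logits"]  # acoustic model output for this chunk
+    # Assumed mapping of the two GatherNd outputs to the hidden and cell states.
+    state_h = results["cudnn_lstm/rnn/multi_rnn_cell/cell_0/cudnn_compatible_lstm_cell/GatherNd_1"]
+    state_c = results["cudnn_lstm/rnn/multi_rnn_cell/cell_0/cudnn_compatible_lstm_cell/GatherNd"]
+```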
diff --git a/docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_YOLO_From_Tensorflow.md b/docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_YOLO_From_Tensorflow.md index 653165576ce125..eb2a1611e0251e 100644 --- a/docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_YOLO_From_Tensorflow.md +++ b/docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_YOLO_From_Tensorflow.md @@ -1,18 +1,49 @@ # Converting YOLO* Models to the Intermediate Representation (IR) {#openvino_docs_MO_DG_prepare_model_convert_model_tf_specific_Convert_YOLO_From_Tensorflow} -This tutorial explains how to convert real-time object detection YOLOv1\*, YOLOv2\*, and YOLOv3\* public models to the Intermediate Representation (IR). All YOLO\* models are originally implemented in the DarkNet\* framework and consist of two files: +This document explains how to convert real-time object detection YOLOv1\*, YOLOv2\*, YOLOv3\* and YOLOv4\* public models to the Intermediate Representation (IR). All YOLO\* models are originally implemented in the DarkNet\* framework and consist of two files: * `.cfg` file with model configurations * `.weights` file with model weights Depending on a YOLO model version, the Model Optimizer converts it differently: -- YOLOv3 has several implementations. This tutorial uses a TensorFlow implementation of YOLOv3 model, which can be directly converted to the IR. +- YOLOv4 must be first converted from Keras\* to TensorFlow 2\*. +- YOLOv3 has several implementations. This tutorial uses a TensorFlow implementation of YOLOv3 model, which can be directly converted to an IR. - YOLOv1 and YOLOv2 models must be first converted to TensorFlow\* using DarkFlow\*. +## Convert YOLOv4 Model to IR + +This section explains how to convert the YOLOv4 Keras\* model from the [https://github.com/Ma-Dan/keras-yolo4](https://github.com/Ma-Dan/keras-yolo4]) repository to an IR. To convert the YOLOv4 model, follow the instructions below: + +1. Download YOLOv4 weights from [yolov4.weights](https://drive.google.com/open?id=1cewMfusmPjYWbrnuJRuKhPMwRe_b9PaT). + +2. Clone the repository with the YOLOv4 model. +```sh +git clone https://github.com/Ma-Dan/keras-yolo4.git +``` + +3. Convert the model to the TensorFlow 2\* format. Save the code below to the `converter.py` file in the same folder as you downloaded `yolov4.weights` and run it. +```python +from keras-yolo4.model import Mish + +model = tf.keras.models.load_model('yolo4_weight.h5', custom_objects={'Mish': Mish}) +tf.saved_model.save(model, 'yolov4') +``` + +```sh +python converter.py +``` + +4. Run Model Optimizer to converter the model from the TensorFlow 2 format to an IR: + +> **NOTE:** Before you run the convertion, make sure you have installed all the Model Optimizer dependencies for TensorFlow 2. +```sh +python mo.py --saved_model_dir yolov4 --output_dir models/IRs --input_shape [1,608,608,3] --model_name yolov4 +``` + ## Convert YOLOv3 Model to IR -On GitHub*, you can find several public versions of TensorFlow YOLOv3 model implementation. This tutorial explains how to convert YOLOv3 model from -the [https://github.com/mystic123/tensorflow-yolo-v3](https://github.com/mystic123/tensorflow-yolo-v3) repository (commit ed60b90) to IR , but the process is similar for other versions of TensorFlow YOLOv3 model. +On GitHub*, you can find several public versions of TensorFlow YOLOv3 model implementation. 
This section explains how to convert YOLOv3 model from +the [https://github.com/mystic123/tensorflow-yolo-v3](https://github.com/mystic123/tensorflow-yolo-v3) repository (commit ed60b90) to an IR , but the process is similar for other versions of TensorFlow YOLOv3 model. ### Overview of YOLOv3 Model Architecture Originally, YOLOv3 model includes feature extractor called `Darknet-53` with three branches at the end that make detections at three different scales. These branches must end with the YOLO `Region` layer. @@ -45,7 +76,7 @@ python3 convert_weights_pb.py --class_names coco.names --data_format NHWC --weig ```sh python3 convert_weights_pb.py --class_names coco.names --data_format NHWC --weights_file yolov3-tiny.weights --tiny ``` -At this step, you may receive a warning like `WARNING:tensorflow:Entity <...> could not be transformed and will be executed as-is.`. To workaround this issue, switch to gast 0.2.2 with the following command: +At this step, you may receive a warning like `WARNING:tensorflow:Entity <...> could not be transformed and will be executed as-is.`. To work around this issue, switch to gast 0.2.2 with the following command: ```sh pip3 install --user gast==0.2.2 ``` @@ -55,7 +86,7 @@ If you have YOLOv3 weights trained for an input image with the size different fr python3 convert_weights_pb.py --class_names coco.names --data_format NHWC --weights_file yolov3_608.weights --size 608 ``` -### Convert YOLOv3 TensorFlow Model to the IR +### Convert YOLOv3 TensorFlow Model to IR To solve the problems explained in the YOLOv3 architecture overview section, use the `yolo_v3.json` or `yolo_v3_tiny.json` (depending on a model) configuration file with custom operations located in the `/deployment_tools/model_optimizer/extensions/front/tf` repository. @@ -79,7 +110,7 @@ It consists of several attributes:
where: - `id` and `match_kind` are parameters that you cannot change. - `custom_attributes` is a parameter that stores all the YOLOv3 specific attributes: - - `classes`, `coords`, `num`, and `masks` are attributes that you should copy from the configuration file + - `classes`, `coords`, `num`, and `masks` are attributes that you should copy from the configuration file that was used for model training. If you used DarkNet officially shared weights, you can use `yolov3.cfg` or `yolov3-tiny.cfg` configuration file from https://github.com/pjreddie/darknet/tree/master/cfg. Replace the default values in `custom_attributes` with the parameters that follow the `[yolo]` titles in the configuration file. @@ -87,7 +118,7 @@ where: - `entry_points` is a node name list to cut off the model and append the Region layer with custom attributes specified above. -To generate the IR of the YOLOv3 TensorFlow model, run:
+To generate an IR of the YOLOv3 TensorFlow model, run:
```sh python3 mo_tf.py \ --input_model /path/to/yolo_v3.pb \ @@ -96,7 +127,7 @@ python3 mo_tf.py \ --output_dir ``` -To generate the IR of the YOLOv3-tiny TensorFlow model, run:
+To generate an IR of the YOLOv3-tiny TensorFlow model, run:
```sh python3 mo_tf.py \ --input_model /path/to/yolo_v3_tiny.pb \ @@ -179,4 +210,4 @@ The model was trained with input values in the range `[0,1]`. OpenVINO™ to * `--transformations_config` adds missing `Region` layers to the model. In the IR, the `Region` layer has name `RegionYolo`. For other applicable parameters, refer to [Convert Model from TensorFlow](../Convert_Model_From_TensorFlow.md). -> **NOTE:** The color channel order (RGB or BGR) of an input data should match the channel order of the model training dataset. If they are different, perform the `RGB<->BGR` conversion specifying the command-line parameter: `--reverse_input_channels`. Otherwise, inference results may be incorrect. For more information about the parameter, refer to **When to Reverse Input Channels** section of [Converting a Model Using General Conversion Parameters](../Converting_Model_General.md). +> **NOTE:** The color channel order (RGB or BGR) of an input data should match the channel order of the model training dataset. If they are different, perform the `RGB<->BGR` conversion specifying the command-line parameter: `--reverse_input_channels`. Otherwise, inference results may be incorrect. For more information about the parameter, refer to **When to Reverse Input Channels** section of [Converting a Model Using General Conversion Parameters](../Converting_Model_General.md). \ No newline at end of file diff --git a/docs/benchmarks/performance_benchmarks_faq.md b/docs/benchmarks/performance_benchmarks_faq.md index f48c3cf38fde52..a89d0fc07c3e90 100644 --- a/docs/benchmarks/performance_benchmarks_faq.md +++ b/docs/benchmarks/performance_benchmarks_faq.md @@ -6,7 +6,7 @@ The following questions and answers are related to [performance benchmarks](./pe New performance benchmarks are typically published on every `major.minor` release of the Intel® Distribution of OpenVINO™ toolkit. #### 2. Where can I find the models used in the performance benchmarks? -All of the models used are included in the toolkit's [Open Model Zoo](https://github.com/opencv/open_model_zoo) GitHub repository. +All of the models used are included in the toolkit's [Open Model Zoo](https://github.com/openvinotoolkit/open_model_zoo) GitHub repository. #### 3. Will there be new models added to the list used for benchmarking? The models used in the performance benchmarks were chosen based on general adoption and usage in deployment scenarios. We're continuing to add new models that support a diverse set of workloads and usage. @@ -21,23 +21,23 @@ All of the performance benchmarks were generated using the open-sourced tool wit The image size used in the inference depends on the network being benchmarked. The following table shows the list of input sizes for each network model. 
| **Model** | **Public Network** | **Task** | **Input Size** (Height x Width) | |------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------|-----------------------------|-----------------------------------| -| [bert-large-uncased-whole-word-masking-squad](https://github.com/opencv/open_model_zoo/tree/develop/models/intel/bert-large-uncased-whole-word-masking-squad-int8-0001) | BERT-large |question / answer |384| -| [deeplabv3-TF](https://github.com/opencv/open_model_zoo/tree/master/models/public/deeplabv3) | DeepLab v3 Tf |semantic segmentation | 513x513 | +| [bert-large-uncased-whole-word-masking-squad](https://github.com/openvinotoolkit/open_model_zoo/tree/develop/models/intel/bert-large-uncased-whole-word-masking-squad-int8-0001) | BERT-large |question / answer |384| +| [deeplabv3-TF](https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/public/deeplabv3) | DeepLab v3 Tf |semantic segmentation | 513x513 | | [densenet-121-TF](https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/public/densenet-121-tf) | Densenet-121 Tf |classification | 224x224 | -| [facenet-20180408-102900-TF](https://github.com/opencv/open_model_zoo/tree/master/models/public/facenet-20180408-102900) | FaceNet TF | face recognition | 160x160 | -| [faster_rcnn_resnet50_coco-TF](https://github.com/opencv/open_model_zoo/tree/master/models/public/faster_rcnn_resnet50_coco) | Faster RCNN Tf | object detection | 600x1024 | +| [facenet-20180408-102900-TF](https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/public/facenet-20180408-102900) | FaceNet TF | face recognition | 160x160 | +| [faster_rcnn_resnet50_coco-TF](https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/public/faster_rcnn_resnet50_coco) | Faster RCNN Tf | object detection | 600x1024 | | [googlenet-v1-TF](https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/public/googlenet-v1-tf) | GoogLeNet_ILSVRC-2012 | classification | 224x224 | -| [inception-v3-TF](https://github.com/opencv/open_model_zoo/tree/master/models/public/googlenet-v3) | Inception v3 Tf | classification | 299x299 | -| [mobilenet-ssd-CF](https://github.com/opencv/open_model_zoo/tree/master/models/public/mobilenet-ssd) | SSD (MobileNet)_COCO-2017_Caffe | object detection | 300x300 | +| [inception-v3-TF](https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/public/googlenet-v3) | Inception v3 Tf | classification | 299x299 | +| [mobilenet-ssd-CF](https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/public/mobilenet-ssd) | SSD (MobileNet)_COCO-2017_Caffe | object detection | 300x300 | | [mobilenet-v1-1.0-224-TF](https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/public/mobilenet-v1-1.0-224-tf) | MobileNet v1 Tf | classification | 224x224 | -| [mobilenet-v2-1.0-224-TF](https://github.com/opencv/open_model_zoo/tree/master/models/public/mobilenet-v2-1.0-224) | MobileNet v2 Tf | classification | 224x224 | +| [mobilenet-v2-1.0-224-TF](https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/public/mobilenet-v2-1.0-224) | MobileNet v2 Tf | classification | 224x224 | | [mobilenet-v2-pytorch](https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/public/mobilenet-v2-pytorch ) | Mobilenet V2 PyTorch | classification | 224x224 | -| [resnet-18-pytorch](https://github.com/opencv/open_model_zoo/tree/master/models/public/resnet-18-pytorch) | ResNet-18 
PyTorch | classification | 224x224 | +| [resnet-18-pytorch](https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/public/resnet-18-pytorch) | ResNet-18 PyTorch | classification | 224x224 | | [resnet-50-pytorch](https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/public/resnet-50-pytorch) | ResNet-50 v1 PyTorch | classification | 224x224 | -| [resnet-50-TF](https://github.com/opencv/open_model_zoo/tree/master/models/public/resnet-50-tf) | ResNet-50_v1_ILSVRC-2012 | classification | 224x224 | -| [se-resnext-50-CF](https://github.com/opencv/open_model_zoo/tree/master/models/public/se-resnext-50) | Se-ResNext-50_ILSVRC-2012_Caffe | classification | 224x224 | -| [squeezenet1.1-CF](https://github.com/opencv/open_model_zoo/tree/master/models/public/squeezenet1.1) | SqueezeNet_v1.1_ILSVRC-2012_Caffe | classification | 227x227 | -| [ssd300-CF](https://github.com/opencv/open_model_zoo/tree/master/models/public/ssd300) | SSD (VGG-16)_VOC-2007_Caffe | object detection | 300x300 | +| [resnet-50-TF](https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/public/resnet-50-tf) | ResNet-50_v1_ILSVRC-2012 | classification | 224x224 | +| [se-resnext-50-CF](https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/public/se-resnext-50) | Se-ResNext-50_ILSVRC-2012_Caffe | classification | 224x224 | +| [squeezenet1.1-CF](https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/public/squeezenet1.1) | SqueezeNet_v1.1_ILSVRC-2012_Caffe | classification | 227x227 | +| [ssd300-CF](https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/public/ssd300) | SSD (VGG-16)_VOC-2007_Caffe | object detection | 300x300 | | [yolo_v3-TF](https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/public/yolo-v3-tf) | TF Keras YOLO v3 Modelset | object detection | 300x300 | | [yolo_v4-TF](https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/public/yolo-v4-tf) | Yolo-V4 TF | object detection | 608x608 | | [ssd_mobilenet_v1_coco-TF](https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/public/ssd_mobilenet_v1_coco) | ssd_mobilenet_v1_coco | object detection | 300x300 | diff --git a/docs/doxygen/doxygen-ignore.txt b/docs/doxygen/doxygen-ignore.txt index 0be7a70dc06342..7f963ac63e71de 100644 --- a/docs/doxygen/doxygen-ignore.txt +++ b/docs/doxygen/doxygen-ignore.txt @@ -22,6 +22,8 @@ inference-engine/include/vpu/vpu_config.hpp inference-engine/include/vpu/vpu_plugin_config.hpp openvino/docs/benchmarks/performance_int8_vs_fp32.md openvino/docs/get_started/get_started_macos.md +openvino/docs/optimization_guide/dldt_optimization_guide.md +openvino/docs/IE_DG/ShapeInference.md inference-engine/include/details/ie_so_pointer.hpp inference-engine/include/ie_compound_blob.h inference-engine/include/ie_data.h diff --git a/docs/doxygen/ie_docs.xml b/docs/doxygen/ie_docs.xml index f287487913d56a..19a87a1e11e97c 100644 --- a/docs/doxygen/ie_docs.xml +++ b/docs/doxygen/ie_docs.xml @@ -41,6 +41,7 @@ limitations under the License. + @@ -59,6 +60,8 @@ limitations under the License. + + @@ -88,6 +91,7 @@ limitations under the License. + @@ -100,6 +104,8 @@ limitations under the License. + + @@ -128,7 +134,7 @@ limitations under the License. - + @@ -157,6 +163,7 @@ limitations under the License. + @@ -186,11 +193,13 @@ limitations under the License. + + @@ -276,6 +285,7 @@ limitations under the License. + @@ -286,14 +296,11 @@ limitations under the License. - - - @@ -303,11 +310,10 @@ limitations under the License. 
- + - diff --git a/docs/get_started/get_started_linux.md b/docs/get_started/get_started_linux.md index d64d63ed2fccf9..b7b8bd470693ad 100644 --- a/docs/get_started/get_started_linux.md +++ b/docs/get_started/get_started_linux.md @@ -227,7 +227,7 @@ You must have a model that is specific for you inference task. Example model typ - Custom (Often based on SSD) Options to find a model suitable for the OpenVINO™ toolkit are: -- Download public and Intel's pre-trained models from the [Open Model Zoo](https://github.com/opencv/open_model_zoo) using [Model Downloader tool](@ref omz_tools_downloader). +- Download public and Intel's pre-trained models from the [Open Model Zoo](https://github.com/openvinotoolkit/open_model_zoo) using [Model Downloader tool](@ref omz_tools_downloader). - Download from GitHub*, Caffe* Zoo, TensorFlow* Zoo, etc. - Train your own model. diff --git a/docs/get_started/get_started_macos.md b/docs/get_started/get_started_macos.md index a15240a1c9b9c4..c58cd418bcf698 100644 --- a/docs/get_started/get_started_macos.md +++ b/docs/get_started/get_started_macos.md @@ -211,7 +211,7 @@ You must have a model that is specific for you inference task. Example model typ - Custom (Often based on SSD) Options to find a model suitable for the OpenVINO™ toolkit are: -- Download public and Intel's pre-trained models from the [Open Model Zoo](https://github.com/opencv/open_model_zoo) using the [Model Downloader tool](@ref omz_tools_downloader). +- Download public and Intel's pre-trained models from the [Open Model Zoo](https://github.com/openvinotoolkit/open_model_zoo) using the [Model Downloader tool](@ref omz_tools_downloader). - Download from GitHub*, Caffe* Zoo, TensorFlow* Zoo, and other resources. - Train your own model. diff --git a/docs/get_started/get_started_raspbian.md b/docs/get_started/get_started_raspbian.md index 5f3baf87d2f638..d810958d723f4c 100644 --- a/docs/get_started/get_started_raspbian.md +++ b/docs/get_started/get_started_raspbian.md @@ -13,7 +13,7 @@ On Raspbian* OS, the OpenVINO™ toolkit consists of the following components: > **NOTE**: > * The OpenVINO™ package for Raspberry* does not include the [Model Optimizer](../MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md). To convert models to Intermediate Representation (IR), you need to install it separately to your host machine. -> * The package does not include the Open Model Zoo demo applications. You can download them separately from the [Open Models Zoo repository](https://github.com/opencv/open_model_zoo). +> * The package does not include the Open Model Zoo demo applications. You can download them separately from the [Open Models Zoo repository](https://github.com/openvinotoolkit/open_model_zoo). In addition, [code samples](../IE_DG/Samples_Overview.md) are provided to help you get up and running with the toolkit. @@ -43,7 +43,7 @@ The primary tools for deploying your models and applications are installed to th The OpenVINO™ workflow on Raspbian* OS is as follows: 1. **Get a pre-trained model** for your inference task. If you want to use your model for inference, the model must be converted to the `.bin` and `.xml` Intermediate Representation (IR) files, which are used as input by Inference Engine. On Raspberry PI, OpenVINO™ toolkit includes only the Inference Engine module. The Model Optimizer is not supported on this platform. 
To get the optimized models you can use one of the following options: - * Download public and Intel's pre-trained models from the [Open Model Zoo](https://github.com/opencv/open_model_zoo) using [Model Downloader tool](@ref omz_tools_downloader). + * Download public and Intel's pre-trained models from the [Open Model Zoo](https://github.com/openvinotoolkit/open_model_zoo) using [Model Downloader tool](@ref omz_tools_downloader).
For more information on pre-trained models, see [Pre-Trained Models Documentation](@ref omz_models_group_intel) * Convert a model using the Model Optimizer from a full installation of Intel® Distribution of OpenVINO™ toolkit on one of the supported platforms. Installation instructions are available: diff --git a/docs/get_started/get_started_windows.md b/docs/get_started/get_started_windows.md index 253af476efb186..fa6680d30b92df 100644 --- a/docs/get_started/get_started_windows.md +++ b/docs/get_started/get_started_windows.md @@ -211,7 +211,7 @@ You must have a model that is specific for you inference task. Example model typ - Custom (Often based on SSD) Options to find a model suitable for the OpenVINO™ toolkit are: -- Download public and Intel's pre-trained models from the [Open Model Zoo](https://github.com/opencv/open_model_zoo) using the [Model Downloader tool](@ref omz_tools_downloader). +- Download public and Intel's pre-trained models from the [Open Model Zoo](https://github.com/openvinotoolkit/open_model_zoo) using the [Model Downloader tool](@ref omz_tools_downloader). - Download from GitHub*, Caffe* Zoo, TensorFlow* Zoo, and other resources. - Train your own model. diff --git a/docs/img/caching_enabled.png b/docs/img/caching_enabled.png new file mode 100644 index 00000000000000..f8a898764e17ee --- /dev/null +++ b/docs/img/caching_enabled.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:488a7a47e5086a6868c22219bc9d58a3508059e5a1dc470f2653a12552dea82f +size 36207 diff --git a/docs/img/caching_times.png b/docs/img/caching_times.png new file mode 100644 index 00000000000000..11d9c8b088f9f6 --- /dev/null +++ b/docs/img/caching_times.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2eed189f9cb3d30fe13b4ba4515edd4e6da5d01545660e65fa8a33d945967281 +size 28894 diff --git a/docs/index.md b/docs/index.md index ca6a1fa6b2ecdb..76da992f162087 100644 --- a/docs/index.md +++ b/docs/index.md @@ -94,12 +94,12 @@ Intel® Distribution of OpenVINO™ toolkit includes the following components: - [Open Model Zoo](@ref omz_models_group_intel) - [Demos](@ref omz_demos): Console applications that provide robust application templates to help you implement specific deep learning scenarios. - Additional Tools: A set of tools to work with your models including [Accuracy Checker Utility](@ref omz_tools_accuracy_checker) and [Model Downloader](@ref omz_tools_downloader). - - [Documentation for Pretrained Models](@ref omz_models_group_intel): Documentation for pre-trained models that are available in the [Open Model Zoo repository](https://github.com/opencv/open_model_zoo). -- Deep Learning Streamer (DL Streamer): Streaming analytics framework, based on GStreamer, for constructing graphs of media analytics components. DL Streamer can be installed by the Intel® Distribution of OpenVINO™ toolkit installer. Its open-source version is available on [GitHub](https://github.com/opencv/gst-video-analytics). For the DL Streamer documentation, see: + - [Documentation for Pretrained Models](@ref omz_models_group_intel): Documentation for pre-trained models that are available in the [Open Model Zoo repository](https://github.com/openvinotoolkit/open_model_zoo). +- Deep Learning Streamer (DL Streamer): Streaming analytics framework, based on GStreamer, for constructing graphs of media analytics components. DL Streamer can be installed by the Intel® Distribution of OpenVINO™ toolkit installer. 
Its open-source version is available on [GitHub](https://github.com/openvinotoolkit/dlstreamer_gst). For the DL Streamer documentation, see: - [DL Streamer Samples](@ref gst_samples_README) - [API Reference](https://openvinotoolkit.github.io/dlstreamer_gst/) - - [Elements](https://github.com/opencv/gst-video-analytics/wiki/Elements) - - [Tutorial](https://github.com/opencv/gst-video-analytics/wiki/DL%20Streamer%20Tutorial) + - [Elements](https://github.com/openvinotoolkit/dlstreamer_gst/wiki/Elements) + - [Tutorial](https://github.com/openvinotoolkit/dlstreamer_gst/wiki/DL-Streamer-Tutorial) - [OpenCV](https://docs.opencv.org/master/) : OpenCV* community version compiled for Intel® hardware - [Intel® Media SDK](https://software.intel.com/en-us/media-sdk) (in Intel® Distribution of OpenVINO™ toolkit for Linux only) diff --git a/docs/install_guides/installing-openvino-apt.md b/docs/install_guides/installing-openvino-apt.md index 665186969912da..982d71102b37a3 100644 --- a/docs/install_guides/installing-openvino-apt.md +++ b/docs/install_guides/installing-openvino-apt.md @@ -2,7 +2,7 @@ This guide provides installation steps for Intel® Distribution of OpenVINO™ toolkit for Linux* distributed through the APT repository. -> **IMPORTANT**: By downloading and using this container and the included software, you agree to the terms and conditions of the [software license agreements](https://software.intel.com/en-us/license/eula-for-intel-software-development-products). Please, review the content inside the `/licensing` folder for more details. +> **IMPORTANT**: By downloading and using this container and the included software, you agree to the terms and conditions of the [software license agreements](https://software.intel.com/content/dam/develop/external/us/en/documents/intel-openvino-license-agreements.pdf). Please, review the content inside the `/licensing` folder for more details. > **NOTE**: Intel® Graphics Compute Runtime for OpenCL™ is not a part of OpenVINO™ APT distribution. You can install it from the [Intel® Graphics Compute Runtime for OpenCL™ GitHub repo](https://github.com/intel/compute-runtime). @@ -14,7 +14,7 @@ The following components are installed with the OpenVINO runtime package: |-----------|------------| | [Inference Engine](../IE_DG/Deep_Learning_Inference_Engine_DevGuide.md)| The engine that runs a deep learning model. It includes a set of libraries for an easy inference integration into your applications. | | [OpenCV*](https://docs.opencv.org/master/) | OpenCV* community version compiled for Intel® hardware. | -| Deep Learning Streamer (DL Streamer) | Streaming analytics framework, based on GStreamer, for constructing graphs of media analytics components. For the DL Streamer documentation, see [DL Streamer Samples](@ref gst_samples_README), [API Reference](https://openvinotoolkit.github.io/dlstreamer_gst/), [Elements](https://github.com/opencv/gst-video-analytics/wiki/Elements), [Tutorial](https://github.com/opencv/gst-video-analytics/wiki/DL%20Streamer%20Tutorial). | +| Deep Learning Streamer (DL Streamer) | Streaming analytics framework, based on GStreamer, for constructing graphs of media analytics components. For the DL Streamer documentation, see [DL Streamer Samples](@ref gst_samples_README), [API Reference](https://openvinotoolkit.github.io/dlstreamer_gst/), [Elements](https://github.com/openvinotoolkit/dlstreamer_gst/wiki/Elements), [Tutorial](https://github.com/openvinotoolkit/dlstreamer_gst/wiki/DL-Streamer-Tutorial). 
| ## Included with Developer Package @@ -28,8 +28,8 @@ The following components are installed with the OpenVINO developer package: | [Sample Applications](../IE_DG/Samples_Overview.md) | A set of simple console applications demonstrating how to use the Inference Engine in your applications. | | [Demo Applications](@ref omz_demos) | A set of console applications that demonstrate how you can use the Inference Engine in your applications to solve specific use cases. | | Additional Tools | A set of tools to work with your models including [Accuracy Checker utility](@ref omz_tools_accuracy_checker), [Post-Training Optimization Tool Guide](@ref pot_README), [Model Downloader](@ref omz_tools_downloader) and other | -| [Documentation for Pre-Trained Models ](@ref omz_models_group_intel) | Documentation for the pre-trained models available in the [Open Model Zoo repo](https://github.com/opencv/open_model_zoo). | -| Deep Learning Streamer (DL Streamer) | Streaming analytics framework, based on GStreamer\*, for constructing graphs of media analytics components. For the DL Streamer documentation, see [DL Streamer Samples](@ref gst_samples_README), [API Reference](https://openvinotoolkit.github.io/dlstreamer_gst/), [Elements](https://github.com/opencv/gst-video-analytics/wiki/Elements), [Tutorial](https://github.com/opencv/gst-video-analytics/wiki/DL%20Streamer%20Tutorial). | +| [Documentation for Pre-Trained Models ](@ref omz_models_group_intel) | Documentation for the pre-trained models available in the [Open Model Zoo repo](https://github.com/openvinotoolkit/open_model_zoo). | +| Deep Learning Streamer (DL Streamer) | Streaming analytics framework, based on GStreamer\*, for constructing graphs of media analytics components. For the DL Streamer documentation, see [DL Streamer Samples](@ref gst_samples_README), [API Reference](https://openvinotoolkit.github.io/dlstreamer_gst/), [Elements](https://github.com/openvinotoolkit/dlstreamer_gst/wiki/Elements), [Tutorial](https://github.com/openvinotoolkit/dlstreamer_gst/wiki/DL-Streamer-Tutorial). | ## Set up the Repository ### Install the GPG key for the repository diff --git a/docs/install_guides/installing-openvino-linux.md b/docs/install_guides/installing-openvino-linux.md index a78fa8fc43d7a1..4a7dd77c506ba0 100644 --- a/docs/install_guides/installing-openvino-linux.md +++ b/docs/install_guides/installing-openvino-linux.md @@ -28,8 +28,8 @@ The Intel® Distribution of OpenVINO™ toolkit for Linux\*: | [Inference Engine Code Samples](../IE_DG/Samples_Overview.md) | A set of simple console applications demonstrating how to utilize specific OpenVINO capabilities in an application and how to perform specific tasks, such as loading a model, running inference, querying specific device capabilities, and more. | | [Demo Applications](@ref omz_demos) | A set of simple console applications that provide robust application templates to help you implement specific deep learning scenarios. | | Additional Tools | A set of tools to work with your models including [Accuracy Checker utility](@ref omz_tools_accuracy_checker), [Post-Training Optimization Tool Guide](@ref pot_README), [Model Downloader](@ref omz_tools_downloader) and other | -| [Documentation for Pre-Trained Models ](@ref omz_models_group_intel) | Documentation for the pre-trained models available in the [Open Model Zoo repo](https://github.com/opencv/open_model_zoo). 
| -| Deep Learning Streamer (DL Streamer) | Streaming analytics framework, based on GStreamer, for constructing graphs of media analytics components. For the DL Streamer documentation, see [DL Streamer Samples](@ref gst_samples_README), [API Reference](https://openvinotoolkit.github.io/dlstreamer_gst/), [Elements](https://github.com/opencv/gst-video-analytics/wiki/Elements), [Tutorial](https://github.com/opencv/gst-video-analytics/wiki/DL%20Streamer%20Tutorial). | +| [Documentation for Pre-Trained Models ](@ref omz_models_group_intel) | Documentation for the pre-trained models available in the [Open Model Zoo repo](https://github.com/openvinotoolkit/open_model_zoo). | +| Deep Learning Streamer (DL Streamer) | Streaming analytics framework, based on GStreamer, for constructing graphs of media analytics components. For the DL Streamer documentation, see [DL Streamer Samples](@ref gst_samples_README), [API Reference](https://openvinotoolkit.github.io/dlstreamer_gst/), [Elements](https://github.com/openvinotoolkit/dlstreamer_gst/wiki/Elements), [Tutorial](https://github.com/openvinotoolkit/dlstreamer_gst/wiki/DL-Streamer-Tutorial). | **Could Be Optionally Installed** diff --git a/docs/install_guides/installing-openvino-macos.md b/docs/install_guides/installing-openvino-macos.md index d878eac5c3a84a..e4a225bdab05bd 100644 --- a/docs/install_guides/installing-openvino-macos.md +++ b/docs/install_guides/installing-openvino-macos.md @@ -29,7 +29,7 @@ The following components are installed by default: | [Sample Applications](../IE_DG/Samples_Overview.md) | A set of simple console applications demonstrating how to use the Inference Engine in your applications. | | [Demos](@ref omz_demos) | A set of console applications that demonstrate how you can use the Inference Engine in your applications to solve specific use-cases | | Additional Tools | A set of tools to work with your models including [Accuracy Checker utility](@ref omz_tools_accuracy_checker), [Post-Training Optimization Tool Guide](@ref pot_README), [Model Downloader](@ref omz_tools_downloader) and other | -| [Documentation for Pre-Trained Models ](@ref omz_models_group_intel) | Documentation for the pre-trained models available in the [Open Model Zoo repo](https://github.com/opencv/open_model_zoo) | +| [Documentation for Pre-Trained Models ](@ref omz_models_group_intel) | Documentation for the pre-trained models available in the [Open Model Zoo repo](https://github.com/openvinotoolkit/open_model_zoo) | **Could Be Optionally Installed** diff --git a/docs/install_guides/installing-openvino-raspbian.md b/docs/install_guides/installing-openvino-raspbian.md index 61cff12e424760..338beda73c813a 100644 --- a/docs/install_guides/installing-openvino-raspbian.md +++ b/docs/install_guides/installing-openvino-raspbian.md @@ -28,7 +28,7 @@ The OpenVINO toolkit for Raspbian OS is an archive with pre-installed header fil > **NOTE**: > * The package does not include the [Model Optimizer](../MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md). To convert models to Intermediate Representation (IR), you need to install it separately to your host machine. -> * The package does not include the Open Model Zoo demo applications. You can download them separately from the [Open Models Zoo repository](https://github.com/opencv/open_model_zoo). +> * The package does not include the Open Model Zoo demo applications. You can download them separately from the [Open Models Zoo repository](https://github.com/openvinotoolkit/open_model_zoo). 
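Since the Raspbian package ships only the Inference Engine runtime (the Model Optimizer and the demo applications are obtained separately, as noted above), a typical flow is to produce the IR files on a host machine and then load them on the device. The snippet below is a minimal, hypothetical sketch using the Inference Engine Python API of that era; the file names `model.xml`/`model.bin`, the dummy input, and the `MYRIAD` device choice are placeholders for illustration, not values taken from this guide.

```python
import numpy as np
from openvino.inference_engine import IECore

ie = IECore()
# IR files produced by the Model Optimizer on a host machine (names are placeholders)
net = ie.read_network(model="model.xml", weights="model.bin")
# Assumes an Intel Neural Compute Stick 2 is attached to the Raspberry Pi
exec_net = ie.load_network(network=net, device_name="MYRIAD")

input_name = next(iter(net.input_info))
output_name = next(iter(net.outputs))

# Feed zeros with the network's expected input shape, just to exercise the pipeline
dummy = np.zeros(net.input_info[input_name].input_data.shape, dtype=np.float32)
result = exec_net.infer(inputs={input_name: dummy})
print(result[output_name].shape)
```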
## Development and Target Platforms @@ -166,7 +166,7 @@ Read the next topic if you want to learn more about OpenVINO workflow for Raspbe If you want to use your model for inference, the model must be converted to the .bin and .xml Intermediate Representation (IR) files that are used as input by Inference Engine. OpenVINO™ toolkit support on Raspberry Pi only includes the Inference Engine module of the Intel® Distribution of OpenVINO™ toolkit. The Model Optimizer is not supported on this platform. To get the optimized models you can use one of the following options: -* Download public and Intel's pre-trained models from the [Open Model Zoo](https://github.com/opencv/open_model_zoo) using [Model Downloader tool](@ref omz_tools_downloader). +* Download public and Intel's pre-trained models from the [Open Model Zoo](https://github.com/openvinotoolkit/open_model_zoo) using [Model Downloader tool](@ref omz_tools_downloader). For more information on pre-trained models, see [Pre-Trained Models Documentation](@ref omz_models_group_intel) diff --git a/docs/install_guides/installing-openvino-windows.md b/docs/install_guides/installing-openvino-windows.md index 1a1a31a07c61fe..8e3b6ece81fb4d 100644 --- a/docs/install_guides/installing-openvino-windows.md +++ b/docs/install_guides/installing-openvino-windows.md @@ -62,7 +62,7 @@ The following components are installed by default: |[Inference Engine Samples](../IE_DG/Samples_Overview.md) |A set of simple console applications demonstrating how to use Intel's Deep Learning Inference Engine in your applications. | | [Demos](@ref omz_demos) | A set of console applications that demonstrate how you can use the Inference Engine in your applications to solve specific use-cases | | Additional Tools | A set of tools to work with your models including [Accuracy Checker utility](@ref omz_tools_accuracy_checker), [Post-Training Optimization Tool Guide](@ref pot_README), [Model Downloader](@ref omz_tools_downloader) and other | -| [Documentation for Pre-Trained Models ](@ref omz_models_group_intel) | Documentation for the pre-trained models available in the [Open Model Zoo repo](https://github.com/opencv/open_model_zoo) | +| [Documentation for Pre-Trained Models ](@ref omz_models_group_intel) | Documentation for the pre-trained models available in the [Open Model Zoo repo](https://github.com/openvinotoolkit/open_model_zoo) | **Could Be Optionally Installed** @@ -248,8 +248,8 @@ Or proceed to the Get Started to get started with run > **NOTE**: These steps are required only if you want to use an Intel® integrated GPU. -If your applications offload computation to **Intel® Integrated Graphics**, you must have the latest version of Intel Graphics Driver for Windows installed for your hardware. -[Download and install a higher version](http://downloadcenter.intel.com/product/80939/Graphics-Drivers). +If your applications offload computation to **Intel® Integrated Graphics**, you must have the Intel Graphics Driver for Windows installed for your hardware. +[Download and install the recommended version](https://downloadcenter.intel.com/download/30079/Intel-Graphics-Windows-10-DCH-Drivers). 
To check if you have this driver installed: @@ -265,8 +265,6 @@ To check if you have this driver installed: ![](../img/DeviceDriverVersion.PNG) -> **NOTE**: To use the **Intel® Iris® Xe MAX Graphics**, see the [Drivers & Software](https://downloadcenter.intel.com/download/29993/Intel-Iris-Xe-MAX-Dedicated-Graphics-Drivers?product=80939) page for driver downloads and installation instructions. - You are done updating your device driver and are ready to use your GPU. Proceed to the Get Started to get started with running code samples and demo applications. ### Optional: Additional Installation Steps for the Intel® Vision Accelerator Design with Intel® Movidius™ VPUs diff --git a/docs/install_guides/installing-openvino-yum.md b/docs/install_guides/installing-openvino-yum.md index 27e464d1b84bd5..ae34d202293b4f 100644 --- a/docs/install_guides/installing-openvino-yum.md +++ b/docs/install_guides/installing-openvino-yum.md @@ -2,7 +2,7 @@ This guide provides installation steps for the Intel® Distribution of OpenVINO™ toolkit for Linux* distributed through the YUM repository. -> **IMPORTANT**: By downloading and using this container and the included software, you agree to the terms and conditions of the [software license agreements](https://software.intel.com/en-us/license/eula-for-intel-software-development-products). Please, review the content inside the `/licensing` folder for more details. +> **IMPORTANT**: By downloading and using this container and the included software, you agree to the terms and conditions of the [software license agreements](https://software.intel.com/content/dam/develop/external/us/en/documents/intel-openvino-license-agreements.pdf). Please, review the content inside the `/licensing` folder for more details. > **NOTE**: Intel® Graphics Compute Runtime for OpenCL™ is not a part of OpenVINO™ YUM distribution. You can install it from the [Intel® Graphics Compute Runtime for OpenCL™ GitHub repo](https://github.com/intel/compute-runtime). @@ -16,7 +16,7 @@ The following components are installed with the OpenVINO runtime package: |-----------|------------| | [Inference Engine](../IE_DG/Deep_Learning_Inference_Engine_DevGuide.md)| The engine that runs a deep learning model. It includes a set of libraries for an easy inference integration into your applications. | | [OpenCV*](https://docs.opencv.org/master/) | OpenCV* community version compiled for Intel® hardware. | -| Deep Learning Stream (DL Streamer) | Streaming analytics framework, based on GStreamer, for constructing graphs of media analytics components. For the DL Streamer documentation, see [DL Streamer Samples](@ref gst_samples_README), [API Reference](https://openvinotoolkit.github.io/dlstreamer_gst/), [Elements](https://github.com/opencv/gst-video-analytics/wiki/Elements), [Tutorial](https://github.com/opencv/gst-video-analytics/wiki/DL%20Streamer%20Tutorial). | +| Deep Learning Stream (DL Streamer) | Streaming analytics framework, based on GStreamer, for constructing graphs of media analytics components. For the DL Streamer documentation, see [DL Streamer Samples](@ref gst_samples_README), [API Reference](https://openvinotoolkit.github.io/dlstreamer_gst/), [Elements](https://github.com/openvinotoolkit/dlstreamer_gst/wiki/Elements), [Tutorial](https://github.com/openvinotoolkit/dlstreamer_gst/wiki/DL-Streamer-Tutorial). 
| ## Set up the Repository diff --git a/docs/model_server/README.md b/docs/model_server/README.md index ae5d03914ab347..e6c7144f3cb6c9 100644 --- a/docs/model_server/README.md +++ b/docs/model_server/README.md @@ -1,29 +1,29 @@ # OpenVINO™ Model Server {#openvino_docs_ovms} -OpenVINO™ Model Server (OVMS) is a scalable, high-performance solution for serving machine learning models optimized for Intel® architectures. -The server provides an inference service via gRPC or REST API - making it easy to deploy new algorithms and AI experiments using the same -architecture as [TensorFlow* Serving](https://github.com/tensorflow/serving) for any models trained in a framework that is supported -by [OpenVINO](https://software.intel.com/en-us/openvino-toolkit). +OpenVINO™ Model Server (OVMS) is a scalable, high-performance solution for serving machine learning models optimized for Intel® architectures. +The server provides an inference service via gRPC or REST API - making it easy to deploy new algorithms and AI experiments using the same +architecture as [TensorFlow* Serving](https://github.com/tensorflow/serving) for any models trained in a framework that is supported +by [OpenVINO](https://software.intel.com/en-us/openvino-toolkit). The server implements gRPC and REST API framework with data serialization and deserialization using TensorFlow Serving API, and OpenVINO™ as the inference execution provider. Model repositories may reside on a locally accessible file system (for example, NFS), Google Cloud Storage\* (GCS), Amazon S3\*, MinIO\*, or Azure Blob Storage\*. - + OVMS is now implemented in C++ and provides much higher scalability compared to its predecessor in the Python version. You can take advantage of all the power of Xeon® CPU capabilities or AI accelerators and expose it over the network interface. Read the [release notes](https://github.com/openvinotoolkit/model_server/releases) to find out what's new in the C++ version. Review the [Architecture Concept](https://github.com/openvinotoolkit/model_server/blob/main/docs/architecture.md) document for more details. -A few key features: +A few key features: - Support for multiple frameworks. Serve models trained in popular formats such as Caffe\*, TensorFlow\*, MXNet\*, and ONNX*. - Deploy new [model versions](https://github.com/openvinotoolkit/model_server/blob/main/docs/docker_container.md#model-version-policy) without changing client code. -- Support for AI accelerators including [Intel Movidius Myriad VPUs](../IE_DG/supported_plugins/VPU), -[GPU](../IE_DG/supported_plugins/CL_DNN), and [HDDL](../IE_DG/supported_plugins/HDDL). +- Support for AI accelerators including [Intel Movidius Myriad VPUs](../IE_DG/supported_plugins/VPU.md), +[GPU](../IE_DG/supported_plugins/GPU.md), and [HDDL](../IE_DG/supported_plugins/HDDL.md). - The server can be enabled both on [Bare Metal Hosts](https://github.com/openvinotoolkit/model_server/blob/main/docs/host.md) or in [Docker* containers](https://github.com/openvinotoolkit/model_server/blob/main/docs/docker_container.md). -- [Kubernetes deployments](https://github.com/openvinotoolkit/model_server/blob/main/deploy). The server can be deployed in a Kubernetes cluster allowing the inference service to scale horizontally and ensure high availability. -- [Model reshaping](https://github.com/openvinotoolkit/model_server/blob/main/docs/docker_container.md#model-reshaping). The server supports reshaping models in runtime. +- [Kubernetes deployments](https://github.com/openvinotoolkit/model_server/blob/main/deploy). 
The server can be deployed in a Kubernetes cluster allowing the inference service to scale horizontally and ensure high availability. +- [Model reshaping](https://github.com/openvinotoolkit/model_server/blob/main/docs/docker_container.md#model-reshaping). The server supports reshaping models in runtime. - [Model ensemble](https://github.com/openvinotoolkit/model_server/blob/main/docs/ensemble_scheduler.md) (preview). Connect multiple models to deploy complex processing solutions and reduce overhead of sending data back and forth. > **NOTE**: OVMS has been tested on CentOS\* and Ubuntu\*. Publicly released [Docker images](https://hub.docker.com/r/openvino/model_server) are based on CentOS. @@ -68,30 +68,30 @@ For more detailed guides on using the Model Server in various scenarios, visit t ## API Documentation -### GRPC +### GRPC -OpenVINO™ Model Server gRPC API is documented in the proto buffer files in [tensorflow_serving_api](https://github.com/tensorflow/serving/tree/r2.2/tensorflow_serving/apis). +OpenVINO™ Model Server gRPC API is documented in the proto buffer files in [tensorflow_serving_api](https://github.com/tensorflow/serving/tree/r2.2/tensorflow_serving/apis). -> **NOTE:** The implementations for `Predict`, `GetModelMetadata`, and `GetModelStatus` function calls are currently available. +> **NOTE:** The implementations for `Predict`, `GetModelMetadata`, and `GetModelStatus` function calls are currently available. > These are the most generic function calls and should address most of the usage scenarios. -[Predict proto](https://github.com/tensorflow/serving/blob/r2.2/tensorflow_serving/apis/predict.proto) defines two message specifications: `PredictRequest` and `PredictResponse` used while calling Prediction endpoint. -* `PredictRequest` specifies information about the model spec, that is name and version, and a map of input data serialized via +[Predict proto](https://github.com/tensorflow/serving/blob/r2.2/tensorflow_serving/apis/predict.proto) defines two message specifications: `PredictRequest` and `PredictResponse` used while calling Prediction endpoint. +* `PredictRequest` specifies information about the model spec, that is name and version, and a map of input data serialized via [TensorProto](https://github.com/tensorflow/tensorflow/blob/r2.2/tensorflow/core/framework/tensor.proto) to a string format. -* `PredictResponse` includes a map of outputs serialized by +* `PredictResponse` includes a map of outputs serialized by [TensorProto](https://github.com/tensorflow/tensorflow/blob/r2.2/tensorflow/core/framework/tensor.proto) and information about the used model spec. - + [Get Model Metadata proto](https://github.com/tensorflow/serving/blob/r2.2/tensorflow_serving/apis/get_model_metadata.proto) defines three message definitions used while calling Metadata endpoint: `SignatureDefMap`, `GetModelMetadataRequest`, `GetModelMetadataResponse`. A function call `GetModelMetadata` accepts model spec information as input and returns Signature Definition content in the format similar to TensorFlow Serving. [Get Model Status proto](https://github.com/tensorflow/serving/blob/r2.2/tensorflow_serving/apis/get_model_status.proto) defines three message definitions used while calling Status endpoint: - `GetModelStatusRequest`, `ModelVersionStatus`, `GetModelStatusResponse` that report all exposed versions including their state in their lifecycle. 
+ `GetModelStatusRequest`, `ModelVersionStatus`, `GetModelStatusResponse` that report all exposed versions including their state in their lifecycle. Refer to the [example client code](https://github.com/openvinotoolkit/model_server/blob/main/example_client) to learn how to use this API and submit the requests using the gRPC interface. -Using the gRPC interface is recommended for optimal performance due to its faster implementation of input data deserialization. It enables you to achieve lower latency, especially with larger input messages like images. +Using the gRPC interface is recommended for optimal performance due to its faster implementation of input data deserialization. It enables you to achieve lower latency, especially with larger input messages like images. ### REST @@ -99,9 +99,9 @@ OpenVINO™ Model Server RESTful API follows the documentation from the [Ten Both row and column format of the requests are implemented. -> **NOTE**: Just like with gRPC, only the implementations for `Predict`, `GetModelMetadata`, and `GetModelStatus` function calls are currently available. +> **NOTE**: Just like with gRPC, only the implementations for `Predict`, `GetModelMetadata`, and `GetModelStatus` function calls are currently available. -Only the numerical data types are supported. +Only the numerical data types are supported. Review the exemplary clients below to find out more how to connect and run inference requests. @@ -110,9 +110,9 @@ REST API is recommended when the primary goal is in reducing the number of clien ## Known Limitations -* Currently, `Predict`, `GetModelMetadata`, and `GetModelStatus` calls are implemented using the TensorFlow Serving API. +* Currently, `Predict`, `GetModelMetadata`, and `GetModelStatus` calls are implemented using the TensorFlow Serving API. * `Classify`, `Regress`, and `MultiInference` are not included. -* `Output_filter` is not effective in the `Predict` call. All outputs defined in the model are returned to the clients. +* `Output_filter` is not effective in the `Predict` call. All outputs defined in the model are returned to the clients. ## OpenVINO Model Server Contribution Policy diff --git a/docs/ops/activation/Swish_4.md b/docs/ops/activation/Swish_4.md index 1a8b7d1b51a4f9..04f17390a16783 100644 --- a/docs/ops/activation/Swish_4.md +++ b/docs/ops/activation/Swish_4.md @@ -9,7 +9,8 @@ **Detailed description** *Swish* operation is introduced in this [article](https://arxiv.org/abs/1710.05941). -It performs element-wise activation function on a given input tensor, based on the following mathematical formula: + +*Swish* is a smooth, non-monotonic function. The non-monotonicity property of *Swish* distinguishes itself from most common activation functions. It performs element-wise activation function on a given input tensor, based on the following mathematical formula: \f[ Swish(x) = x\cdot \sigma(\beta x) = x \left(1 + e^{-(\beta x)}\right)^{-1} diff --git a/docs/ops/arithmetic/Ceiling_1.md b/docs/ops/arithmetic/Ceiling_1.md index 588b5ff6842f55..4d4cfeb945002e 100644 --- a/docs/ops/arithmetic/Ceiling_1.md +++ b/docs/ops/arithmetic/Ceiling_1.md @@ -2,31 +2,31 @@ **Versioned name**: *Ceiling-1* -**Category**: Arithmetic unary operation +**Category**: Arithmetic unary operation **Short description**: *Ceiling* performs element-wise ceiling operation with given tensor. 
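Returning to the *Swish* formula above, the behaviour is easy to reproduce in a few lines of NumPy. This is only an illustrative sketch; defaulting `beta` to `1.0` here is an assumption made for the example, not a statement about the operation's inputs.

```python
import numpy as np

def swish(x, beta=1.0):
    """Element-wise Swish(x) = x * sigmoid(beta * x), matching the formula above."""
    return x * (1.0 / (1.0 + np.exp(-beta * x)))

x = np.array([-2.0, -0.5, 0.0, 0.5, 2.0])
print(swish(x))            # beta = 1.0
print(swish(x, beta=2.0))  # a larger beta makes the gating sharper
```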
-**Attributes**: +**Detailed description**: For each element from the input tensor calculates corresponding +element in the output tensor with the following formula: - No attributes available. +\f[ +a_{i} = ceiling(a_{i}) +\f] + +**Attributes**: *Ceiling* operation has no attributes. **Inputs** -* **1**: An tensor of type T. **Required.** +* **1**: A tensor of type *T* and arbitrary shape. **Required.** **Outputs** -* **1**: The result of element-wise ceiling operation. A tensor of type T. +* **1**: The result of element-wise ceiling operation. A tensor of type *T*. **Types** * *T*: any numeric type. -*Ceiling* does the following with the input tensor *a*: - -\f[ -a_{i} = ceiling(a_{i}) -\f] **Examples** diff --git a/docs/ops/arithmetic/Log_1.md b/docs/ops/arithmetic/Log_1.md index 6f33b002b693b7..f13149198210a4 100644 --- a/docs/ops/arithmetic/Log_1.md +++ b/docs/ops/arithmetic/Log_1.md @@ -6,28 +6,28 @@ **Short description**: *Log* performs element-wise natural logarithm operation with given tensor. +**Detailed description**: *Log* does the following with the input tensor *a*: + +\f[ +a_{i} = log(a_{i}) +\f] + **Attributes**: No attributes available. **Inputs** -* **1**: An tensor of type T. **Required.** +* **1**: An tensor of type T and arbitrary shape. **Required.** **Outputs** -* **1**: The result of element-wise log operation. A tensor of type T. +* **1**: The result of element-wise log operation. A tensor of type T and the same shape as input. **Types** * *T*: any numeric type. -*Log* does the following with the input tensor *a*: - -\f[ -a_{i} = log(a_{i}) -\f] - **Examples** *Example 1* diff --git a/docs/ops/arithmetic/Negative_1.md b/docs/ops/arithmetic/Negative_1.md index 2e17112e7bcc51..997342c2d05da1 100644 --- a/docs/ops/arithmetic/Negative_1.md +++ b/docs/ops/arithmetic/Negative_1.md @@ -2,35 +2,33 @@ **Versioned name**: *Negative-1* -**Category**: Arithmetic unary operation +**Category**: Arithmetic unary operation -**Short description**: *Negative* performs element-wise negative operation with given tensor. +**Short description**: *Negative* performs element-wise negative operation on a given input tensor. -**Attributes**: +**Detailed description** - No attributes available. +*Negative* performs element-wise negative operation on a given input tensor, based on the following mathematical formula: + +\f[ +a_{i} = -a_{i} +\f] + +**Attributes**: *Negative* operation has no attributes. **Inputs** -* **1**: An tensor of type T. **Required.** +* **1**: A tensor of type *T* and arbitrary shape. **Required.** **Outputs** -* **1**: The result of element-wise negative operation. A tensor of type T. +* **1**: The result of element-wise *Negative* operation applied to the input tensor. A tensor of type *T* and the same shape as input tensor. **Types** -* *T*: any numeric type. - -*Negative* does the following with the input tensor *a*: - -\f[ -a_{i} = -a_{i} -\f] - -**Examples** +* *T*: any supported signed numeric type. -*Example 1* +**Example** ```xml @@ -47,4 +45,4 @@ a_{i} = -a_{i} -``` \ No newline at end of file +``` diff --git a/docs/ops/convolution/DeformableConvolution_1.md b/docs/ops/convolution/DeformableConvolution_1.md index 2cba8d84039fe9..612d3c419d4ec2 100644 --- a/docs/ops/convolution/DeformableConvolution_1.md +++ b/docs/ops/convolution/DeformableConvolution_1.md @@ -8,6 +8,26 @@ **Detailed description**: *Deformable Convolution* is similar to regular *Convolution* but its receptive field is deformed because of additional spatial offsets used during input sampling. 
More thorough explanation can be found in [Deformable Convolutions Demystified](https://towardsdatascience.com/deformable-convolutions-demystified-2a77498699e8) and [Deformable Convolutional Networks](https://arxiv.org/abs/1703.06211). +Output is calculated using the following formula: + + \f[ + + y(p) = \sum_{k = 1}^{K}w_{k}x(p + p_{k} + {\Delta}p_{k}) + + \f] + +Where +* K is a number of sampling locations, e.g. for kernel 3x3 and dilation = 1, K = 9 + +* \f$x(p)\f$ and \f$y(p)\f$ denote the features at location p from the input feature maps x and output feature maps y + +* \f$w_{k}\f$ is the weight for k-th location. + +* \f$p_{k}\f$ is pre-specified offset for the k-th location, e.g. K = 9 and +\f$p_{k} \in \{(-1, -1),(-1, 0), . . . ,(1, 1)\}\f$ + +* \f${\Delta}p_{k}\f$ is the learnable offset for the k-th location. + **Attributes**: * *strides* diff --git a/docs/ops/convolution/DeformableConvolution_8.md b/docs/ops/convolution/DeformableConvolution_8.md new file mode 100644 index 00000000000000..cf59584a5f4104 --- /dev/null +++ b/docs/ops/convolution/DeformableConvolution_8.md @@ -0,0 +1,168 @@ +## DeformableConvolution {#openvino_docs_ops_convolution_DeformableConvolution_8} + +**Versioned name**: *DeformableConvolution-8* + +**Category**: Convolution + +**Short description**: Computes 2D deformable convolution of input and kernel tensors. + +**Detailed description**: *Deformable Convolution* is similar to regular *Convolution* but its receptive field is deformed because of additional spatial offsets used during input sampling. More thorough explanation can be found in [Deformable Convolutions Demystified](https://towardsdatascience.com/deformable-convolutions-demystified-2a77498699e8), [Deformable Convolutional Networks](https://arxiv.org/abs/1703.06211). + +Modification of DeformableConvolution using modulating scalars is also supported. Please refer to [Deformable ConvNets v2: More Deformable, Better Results](https://arxiv.org/pdf/1811.11168.pdf). + +Output is calculated using the following formula: + + \f[ + + y(p) = \sum_{k = 1}^{K}w_{k}x(p + p_{k} + {\Delta}p_{k}) * {\Delta}m_{k} + + \f] +Where +* K is a number of sampling locations, e.g. for kernel 3x3 and dilation = 1, K = 9 + +* \f$x(p)\f$ and \f$y(p)\f$ denote the features at location p from the input feature maps x and output feature maps y + +* \f$w_{k}\f$ is the weight for k-th location. + +* \f$p_{k}\f$ is pre-specified offset for the k-th location, e.g. K = 9 and +\f$p_{k} \in \{(-1, -1),(-1, 0), . . . ,(1, 1)\}\f$ + +* \f${\Delta}p_{k}\f$ is the learnable offset for the k-th location. + +* \f${\Delta}m_{k}\f$ is the modulation scalar from 0 to 1 for the k-th location. + +**Attributes**: + +* *strides* + + * **Description**: *strides* is a distance (in pixels) to slide the filter on the feature map over the `(y,x)` axes. For example, *strides* equal `2,1` means sliding the filter 2 pixel at a time over height dimension and 1 over width dimension. + * **Range of values**: integer values starting from `0` + * **Type**: `int[]` + * **Default value**: None + * **Required**: *yes* + +* *pads_begin* + + * **Description**: *pads_begin* is a number of pixels to add to the beginning along each axis. For example, *pads_begin* equal `1,2` means adding 1 pixel to the top of the input and 2 to the left of the input. + * **Range of values**: integer values starting from `0` + * **Type**: `int[]` + * **Default value**: None + * **Required**: *yes* + * **Note**: the attribute is ignored when *auto_pad* attribute is specified. 
+ +* *pads_end* + + * **Description**: *pads_end* is a number of pixels to add to the ending along each axis. For example, *pads_end* equal `1,2` means adding 1 pixel to the bottom of the input and 2 to the right of the input. + * **Range of values**: integer values starting from `0` + * **Type**: `int[]` + * **Default value**: None + * **Required**: *yes* + * **Note**: the attribute is ignored when *auto_pad* attribute is specified. + +* *dilations* + + * **Description**: *dilations* denotes the distance in width and height between elements (weights) in the filter. For example, *dilation* equal `1,1` means that all the elements in the filter are neighbors, so it is the same as for the usual convolution. *dilation* equal `2,2` means that all the elements in the filter are matched not to adjacent elements in the input matrix, but to those that are adjacent with distance 1. + * **Range of values**: integer value starting from `0` + * **Type**: `int[]` + * **Default value**: None + * **Required**: *yes* + +* *auto_pad* + + * **Description**: *auto_pad* how the padding is calculated. Possible values: + * *explicit* - use explicit padding values from *pads_begin* and *pads_end*. + * *same_upper* - the input is padded to match the output size. In case of odd padding value an extra padding is added at the end. + * *same_lower* - the input is padded to match the output size. In case of odd padding value an extra padding is added at the beginning. + * *valid* - do not use padding. + * **Type**: `string` + * **Default value**: explicit + * **Required**: *no* + * **Note**: *pads_begin* and *pads_end* attributes are ignored when *auto_pad* is specified. + + +* *group* + + * **Description**: *group* is the number of groups which *output* and *input* should be split into. For example, *group* equal to 1 means that all filters are applied to the whole input (usual convolution), *group* equal to 2 means that both *input* and *output* channels are separated into two groups and the *i-th output* group is connected to the *i-th input* group channel. *group* equal to a number of output feature maps implies depth-wise separable convolution. + * **Range of values**: integer value starting from `1` + * **Type**: `int` + * **Default value**: `1` + * **Required**: *no* + +* *deformable_group* + + * **Description**: *deformable_group* is the number of groups in which *offsets* input and *output* should be split into along the channel axis. Apply the deformable convolution using the i-th part of the offsets part on the i-th out. + * **Range of values**: integer value starting from `1` + * **Type**: `int` + * **Default value**: `1` + * **Required**: *no* + +* *bilinear_interpolation_padding* + + * **Description**: *bilinear_interpolation_padding* is the number of pixels outside of the feature map boundary to apply bilinear interpolation. + * **Range of values**: non-negative integer value + * **Type**: `int` + * **Default value**: `0` + * **Required**: *no* + +**Inputs**: + +* **1**: Input tensor of type *T* and rank 4. Layout is `NCYX` (number of batches, number of channels, spatial axes Y and X). **Required.** + +* **2**: Offsets tensor of type *T* and rank 4. Layout is `NCYX` (number of batches, *deformable_group* \* kernel_Y \* kernel_X \* 2, spatial axes Y and X). **Required.** + +* **3**: Kernel tensor of type *T* and rank 4. Layout is `OIYX` (number of output channels, number of input channels, spatial axes Y and X). 
**Required.** + +* **4**: ModulationScalars tensor of type *T2* and rank 4, the values are within [0, 1]. Layout is `NCYX` (number of batches, *deformable_group* \* kernel_Y \* kernel_X, spatial axes Y and X). If the input is not provided, the values are assumed to be equal to 1. **Optional.** + + +**Outputs**: + +* **1**: Output tensor of type *T* and rank 4. Layout is `NOYX` (number of batches, number of kernel output channels, spatial axes Y and X). + +**Types**: + +* *T*: Any numeric type. +* *T2*: Any supported floating point. + +**Example** + +2D DeformableConvolution (deformable_group=1) +```xml + + + + + 1 + 4 + 224 + 224 + + + 1 + 50 + 220 + 220 + + + 64 + 4 + 5 + 5 + + + 1 + 25 + 220 + 220 + + + + + 1 + 64 + 220 + 220 + + + +``` diff --git a/docs/ops/detection/RegionYolo_1.md b/docs/ops/detection/RegionYolo_1.md index c4eece6ff8b419..ebcc79c23777cb 100644 --- a/docs/ops/detection/RegionYolo_1.md +++ b/docs/ops/detection/RegionYolo_1.md @@ -6,7 +6,7 @@ **Short description**: *RegionYolo* computes the coordinates of regions with probability for each class. -**Detailed description**: This operation is directly mapped to the original YOLO layer. [Reference](https://arxiv.org/pdf/1612.08242.pdf) +**Detailed description**: This operation is directly mapped to the [YOLO9000: Better, Faster, Stronger](https://arxiv.org/pdf/1612.08242.pdf) paper. **Attributes**: @@ -78,14 +78,17 @@ **Inputs**: -* **1**: `data` - 4D input tensor with floating point elements and shape `[N, C, H, W]`. Required. +* **1**: `data` - 4D tensor of type `T` and shape `[N, C, H, W]`. **Required.** **Outputs**: -* **1**: output tensor of rank 4 or less that codes detected regions. Refer to the original YOLO paper to decode the output as boxes. `anchors` should be used to decode real box coordinates. If `do_softmax` is set to 0, then the output shape is `[N, (classes + coords + 1)*len(mask), H, W]`. If `do_softmax` is set to 1, then output shape is partially flattened and defined in the following way: +* **1**: tensor of type `T` and rank 4 or less that codes detected regions. Refer to the [YOLO9000: Better, Faster, Stronger](https://arxiv.org/pdf/1612.08242.pdf) paper to decode the output as boxes. `anchors` should be used to decode real box coordinates. If `do_softmax` is set to `0`, then the output shape is `[N, (classes + coords + 1) * len(mask), H, W]`. If `do_softmax` is set to `1`, then output shape is partially flattened and defined in the following way: - flat_dim = data.shape[axis] * data.shape[axis+1] * ... * data.shape[end_axis] - output.shape = [data.shape[0], ..., data.shape[axis-1], flat_dim, data.shape[end_axis + 1], ...] + `flat_dim = data.shape[axis] * data.shape[axis+1] * ... * data.shape[end_axis]` + `output.shape = [data.shape[0], ..., data.shape[axis-1], flat_dim, data.shape[end_axis + 1], ...]` + +**Types** +* *T*: any supported floating point type. 
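The output shape rules above translate directly into a short helper. The concrete numbers below (a `1x425x13x13` YOLOv2-like input, `axis=1`, `end_axis=3`) are illustrative values only, not defaults taken from the specification, and negative `axis`/`end_axis` normalization is omitted for brevity.

```python
from functools import reduce
from operator import mul

def region_yolo_output_shape(data_shape, axis, end_axis, classes, coords, mask_len, do_softmax):
    """Shape inference following the output description above (non-negative axis/end_axis only)."""
    if not do_softmax:
        n, _, h, w = data_shape
        return [n, (classes + coords + 1) * mask_len, h, w]
    # do_softmax == 1: dimensions [axis .. end_axis] are flattened into a single one
    flat_dim = reduce(mul, data_shape[axis:end_axis + 1], 1)
    return list(data_shape[:axis]) + [flat_dim] + list(data_shape[end_axis + 1:])

# Illustrative values only (tensors of layout [N, C, H, W])
print(region_yolo_output_shape((1, 425, 13, 13), axis=1, end_axis=3,
                               classes=80, coords=4, mask_len=0, do_softmax=1))  # [1, 71825]
print(region_yolo_output_shape((1, 255, 13, 13), axis=1, end_axis=3,
                               classes=80, coords=4, mask_len=3, do_softmax=0))  # [1, 255, 13, 13]
```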
**Example** diff --git a/docs/ops/infrastructure/Parameter_1.md b/docs/ops/infrastructure/Parameter_1.md index 807a606a375036..879880002e6319 100644 --- a/docs/ops/infrastructure/Parameter_1.md +++ b/docs/ops/infrastructure/Parameter_1.md @@ -11,18 +11,27 @@ * *element_type* * **Description**: the type of element of output tensor - * **Range of values**: u8, u16, u32, u64, i8, i16, i32, i64, f16, f32, boolean, bf16 - * **Type**: string + * **Range of values**: u1, u4, u8, u16, u32, u64, i4, i8, i16, i32, i64, f16, f32, boolean, bf16 + * **Type**: `string` * **Default value**: None - * **Required**: *Yes* + * **Required**: *yes* * *shape* * **Description**: the shape of the output tensor - * **Range of values**: list of non-negative integers, empty list is allowed that means 0D or scalar tensor - * **Type**: int[] + * **Range of values**: list of non-negative integers, empty list is allowed, which means 0D or scalar tensor + * **Type**: `int[]` * **Default value**: None - * **Required**: *Yes* + * **Required**: *yes* + + +**Outputs** + +* **1**: Output tensor of type *T* and shape equal to *shape* attribute. + +**Types** + +* *T*: any type from *element type* values. **Example** @@ -38,4 +47,4 @@ -``` \ No newline at end of file +``` diff --git a/docs/ops/movement/ExtractImagePatches_3.md b/docs/ops/movement/ExtractImagePatches_3.md index 3604d3b49ca19d..5046854ee22269 100644 --- a/docs/ops/movement/ExtractImagePatches_3.md +++ b/docs/ops/movement/ExtractImagePatches_3.md @@ -8,9 +8,7 @@ **Detailed description**: -The *ExtractImagePatches* operation is similar to the TensorFlow* operation [ExtractImagePatches](https://www.tensorflow.org/api_docs/python/tf/image/extract_patches). - -This op extracts patches of shape `sizes` which are `strides` apart in the input image. The output elements are taken from the input at intervals given by the `rate` argument, as in dilated convolutions. +The *ExtractImagePatches* operation extracts patches of shape `sizes` which are `strides` apart in the input image. The output elements are taken from the input at intervals given by the `rate` argument, as in dilated convolutions. The result is a 4D tensor containing image patches with size `size[0] * size[1] * depth` vectorized in the "depth" dimension. @@ -92,20 +90,23 @@ The "auto_pad" attribute has no effect on the size of each patch, it determines Image is a `1 x 1 x 10 x 10` array that contains the numbers 1 through 100. We use the symbol `x` to mark output patches. 1. 
`sizes="3,3", strides="5,5", rates="1,1", auto_pad="valid"` +\f[ + \begin{bmatrix} + x & x & x & 4 & 5 & x & x & x & 9 & 10 \\ + x & x & x & 14 & 15 & x & x & x & 19 & 20 \\ + x & x & x & 24 & 25 & x & x & x & 29 & 30 \\ + 31 & 32 & 33 & 34 & 35 & 36 & 37 & 38 & 39 & 40 \\ + 41 & 42 & 43 & 44 & 45 & 46 & 47 & 48 & 49 & 50 \\ + x & x & x & 54 & 55 & x & x & x & 59 & 60 \\ + x & x & x & 64 & 65 & x & x & x & 69 & 70 \\ + x & x & x & 74 & 75 & x & x & x & 79 & 80 \\ + 81 & 82 & 83 & 84 & 85 & 86 & 87 & 88 & 89 & 90 \\ + 91 & 92 & 93 & 94 & 95 & 96 & 79 & 98 & 99 & 100 + \end{bmatrix} +\f] -   x   x   x    4   5   x   x   x   9 10 -   x   x   x  14 15   x   x   x 19 20 -   x   x   x  24 25   x   x   x 29 30 - 31 32 33 34 35 36 37 38 39 40 - 41 42 43 44 45 46 47 48 49 50 -   x   x   x  54 55   x   x   x 59 60 -   x   x   x  64 65   x   x   x 69 70 -   x   x   x  74 75   x   x   x 79 80 - 81 82 83 84 85 86 87 88 89 90 - 91 92 93 94 95 96 97 98 99 100 - output: - +``` [[[[ 1 6] [51 56]] @@ -132,25 +133,28 @@ Image is a `1 x 1 x 10 x 10` array that contains the numbers 1 through 100. We u [[23 28] [73 78]]]] - +``` output shape: `[1, 9, 2, 2]` 2. `sizes="4,4", strides="8,8", rates="1,1", auto_pad="valid"` - -   x   x   x   x    5   6   7   8   9 10 -   x   x   x   x  15 16 17 18 19 20 -   x   x   x   x  25 26 27 28 29 30 -   x   x   x   x  35 36 37 38 39 40 - 41 42 43 44 45 46 47 48 49 50 - 51 52 53 54 55 56 57 58 59 60 - 61 62 63 64 65 66 67 68 69 70 - 71 72 73 74 75 76 77 78 79 80 - 81 82 83 84 85 86 87 88 89 90 - 91 92 93 94 95 96 97 98 99 100 +\f[ + \begin{bmatrix} + x & x & x & x & 5 & 6 & 7 & 8 & 9 & 10 \\ + x & x & x & x & 15 & 16 & 17 & 18 & 19 & 20 \\ + x & x & x & x & 25 & 26 & 27 & 28 & 29 & 30 \\ + x & x & x & x & 35 & 36 & 37 & 38 & 39 & 40 \\ + 41 & 42 & 43 & 44 & 45 & 46 & 47 & 48 & 49 & 50 \\ + 51 & 52 & 53 & 54 & 55 & 56 & 57 & 58 & 59 & 60 \\ + 61 & 62 & 63 & 64 & 65 & 66 & 67 & 68 & 69 & 70 \\ + 71 & 72 & 73 & 74 & 75 & 76 & 77 & 78 & 79 & 80 \\ + 81 & 82 & 83 & 84 & 85 & 86 & 87 & 88 & 89 & 90 \\ + 91 & 92 & 93 & 94 & 95 & 96 & 79 & 98 & 99 & 100 + \end{bmatrix} +\f] output: - - [[[[ 1]] +``` + [[[[ 1]] [[ 2]] @@ -180,28 +184,30 @@ Image is a `1 x 1 x 10 x 10` array that contains the numbers 1 through 100. We u [[33]] - [[34]]]] - + [[34]]]] +``` output shape: `[1, 16, 1, 1]` 3. 
`sizes="4,4", strides="9,9", rates="1,1", auto_pad="same_upper"` - -   x   x   x   x    0   0   0   0   0   x   x   x   x -   x   x   x   x    4   5   6   7   8   x   x   x   x -   x   x   x   x  14 15 16 17 18   x   x   x   x -   x   x   x   x  24 25 26 27 28   x   x   x   x -   0 31 32 33 34 35 36 37 38 39 40   0   0 -   0 41 42 43 44 45 46 47 48 49 50   0   0 -   0 51 52 53 54 55 56 57 58 59 60   0   0 -   0 61 62 63 64 65 66 67 68 69 70   0   0 -   0 71 72 73 74 75 76 77 78 79 80   0   0 -   x   x   x   x  84 85 86 87 88   x   x   x   x -   x   x   x   x  94 95 96 97 98   x   x   x   x -   x   x   x   x    0   0   0   0   0   x   x   x   x -   x   x   x   x    0   0   0   0   0   x   x   x   x - +\f[ + \begin{bmatrix} + x & x & x & x & 0 & 0 & 0 & 0 & 0 & x & x & x & x\\ + x & x & x & x & 4 & 5 & 6 & 7 & 8 & x & x & x & x\\ + x & x & x & x & 14 & 15 & 16 & 17 & 18 & x & x & x & x\\ + x & x & x & x & 24 & 25 & 26 & 27 & 28 & x & x & x & x\\ + 0 & 31 & 32 & 33 & 34 & 35 & 36 & 37 & 38 & 39 & 40 & 0 & 0\\ + 0 & 41 & 42 & 43 & 44 & 45 & 46 & 47 & 48 & 49 & 50 & 0 & 0\\ + 0 & 51 & 52 & 53 & 54 & 55 & 56 & 57 & 58 & 59 & 60 & 0 & 0\\ + 0 & 61 & 62 & 63 & 64 & 65 & 66 & 67 & 68 & 69 & 70 & 0 & 0\\ + 0 & 71 & 72 & 73 & 74 & 75 & 76 & 77 & 78 & 79 & 80 & 0 & 0\\ + x & x & x & x & 84 & 85 & 86 & 87 & 88 & x & x & x & x\\ + x & x & x & x & 94 & 95 & 96 & 79 & 98 & x & x & x & x\\ + x & x & x & x & 0 & 0 & 0 & 0 & 0 & x & x & x & x\\ + x & x & x & x & 0 & 0 & 0 & 0 & 0 & x & x & x & x + \end{bmatrix} +\f] output: - +``` [[[[ 0 0] [ 0 89]] @@ -249,25 +255,28 @@ Image is a `1 x 1 x 10 x 10` array that contains the numbers 1 through 100. We u [[ 23 0] [ 0 0]]]] - +``` output shape: `[1, 16, 2, 2]` 4. `sizes="3,3", strides="5,5", rates="2,2", auto_pad="valid"` This time we use the symbols `x`, `y`, `z` and `k` to distinguish the patches: - -   x   2   x   4   x   y   7   y   9   y - 11 12 13 14 15 16 17 18 19 20 -   x  22   x 24   x   y 27   y 29   y - 31 32 33 34 35 36 37 38 39 40 -   x  42   x 44   x   y 47   y 49   y -   z  52   z 54   z   k 57   k 59   k - 61 62 63 64 65 66 67 68 69 70 -   z  72   z 74   z   k 77   k 79   k - 81 82 83 84 85 86 87 88 89 90 -   z  92   z 94   z   k 97   k 99   k - - output: +\f[ + \begin{bmatrix} + x & 2 & x & 4 & x & y & 7 & y & 9 & y \\ + 11 & 12 & 13 & 14 & 15 & 16 & 17 & 18 & 19 & 20 \\ + x & 22 & x & 24 & x & y & 27 & y & 29 & y \\ + 31 & 32 & 33 & 34 & 35 & 36 & 37 & 38 & 39 & 40 \\ + x & 42 & x & 44 & x & y & 47 & y & 49 & y \\ + z & 52 & z & 54 & z & k & 57 & k & 59 & k \\ + 61 & 62 & 63 & 64 & 65 & 66 & 67 & 68 & 69 & 70 \\ + z & 72 & z & 74 & z & k & 77 & k & 79 & k \\ + 81 & 82 & 83 & 84 & 85 & 86 & 87 & 88 & 89 & 90 \\ + z & 92 & z & 94 & z & k & 79 & k & 99 & k + \end{bmatrix} +\f] + output: +``` [[[[ 1 6] [ 51 56]] @@ -294,26 +303,30 @@ This time we use the symbols `x`, `y`, `z` and `k` to distinguish the patches: [[ 45 50] [ 95 100]]]] - +``` output_shape: `[1, 9, 2, 2]` 5. 
`sizes="2,2", strides="3,3", rates="1,1", auto_pad="valid"` Image is a `1 x 2 x 5 x 5` array that contains two feature maps where feature map with coordinate 0 contains numbers in a range `[1, 25]` and feature map with coordinate 1 contains numbers in a range `[26, 50]` -   x   x   3   x   x -   6   7   8   x   x - 11 12 13 14 15 -   x   x  18   x   x -   x   x  23   x   x - -   x   x  28   x   x -   x   x  33   x   x - 36 37 38 39 40 -   x   x  43   x   x -   x   x  48   x   x - +\f[ + \begin{bmatrix} + x & x & 3 & x & x\\ + x & x & 8 & x & x\\ + 11 & 12 & 13 & 14 & 15\\ + x & x & 18 & x & x\\ + x & x & 23 & x & x + \end{bmatrix}\\ + \begin{bmatrix} + x & x & 28 & x & x\\ + x & x & 33 & x & x\\ + 36 & 37 & 38 & 39 & 40\\ + x & x & 43 & x & x\\ + x & x & 48 & x & x + \end{bmatrix} +\f] output: - +``` [[[[ 1 4] [16 19]] @@ -337,5 +350,5 @@ Image is a `1 x 2 x 5 x 5` array that contains two feature maps where feature ma [[32 35] [47 50]]]] - +``` output shape: `[1, 8, 2, 2]` diff --git a/docs/ops/movement/Gather_8.md b/docs/ops/movement/Gather_8.md new file mode 100644 index 00000000000000..bdb687bd467c02 --- /dev/null +++ b/docs/ops/movement/Gather_8.md @@ -0,0 +1,200 @@ +## Gather {#openvino_docs_ops_movement_Gather_8} + +**Versioned name**: *Gather-8* + +**Category**: Data movement operations + +**Short description**: *Gather* operation takes slices of data of the first input tensor according to the indices + specified with the second input tensor and axis from the third input. Semantics of this operation is identical to +TensorFlow\* [Gather](https://www.tensorflow.org/api_docs/python/tf/gather) operation. + +**Detailed description** + + output[p_0, p_1, ..., p_{axis-1}, i_b, ..., i_{M-1}, p_{axis+1}, ..., p_{N-1}] = + data[p_0, p_1, ..., p_{axis-1}, indices[p_0, p_1, ..., p_{b-1}, i_b, ..., i_{M-1}], p_{axis+1}, ..., p_{N-1}] + +Where `data`, `indices` and `axis` are tensors from first, second and third inputs correspondingly, `b` is +the number of batch dimensions. `N` and `M` are numbers of dimensions of `data` and `indices` tensors, respectively. + +**Attributes**: +* *batch_dims* + * **Description**: *batch_dims* (also denoted as `b`) is a leading number of dimensions of `data` tensor and `indices` + representing the batches, and *Gather* starts to gather from the `b` dimension. It requires the first `b` + dimensions in `data` and `indices` tensors to be equal. If `batch_dims` is less than zero, normalized value is used + `batch_dims = indices.rank + batch_dims`. + * **Range of values**: `[-min(data.rank, indices.rank); min(data.rank, indices.rank)]` and `batch_dims' <= axis'`. + Where `batch_dims'` and `axis'` stand for normalized `batch_dims` and `axis` values. 
+ * **Type**: *T_AXIS* + * **Default value**: 0 + * **Required**: *no* + +Example 1 with default *batch_dims* value: +``` +batch_dims = 0 +axis = 0 + +indices = [0, 0, 4] +data = [1, 2, 3, 4, 5] +output = [1, 1, 5] +``` + +Example 2 with non-default *batch_dims* value: +``` +batch_dims = 1 +axis = 1 + +indices = [[0, 0, 4], <-- this is applied to the first batch + [4, 0, 0]] <-- this is applied to the second batch +indices_shape = (2, 3) + +data = [[1, 2, 3, 4, 5], <-- the first batch + [6, 7, 8, 9, 10]] <-- the second batch +data_shape = (2, 5) + +output = [[ 1, 1, 5], + [10, 6, 6]] +output_shape = (2, 3) +``` + +Example 3 with non-default *batch_dims* value: +``` +batch_dims = 2 +axis = 2 + +indices = [[[0, 0, 4], <-- this is applied to the first batch, index = (0, 0) + [4, 0, 0]], <-- this is applied to the second batch, index = (0, 1) + + [[1, 2, 4], <-- this is applied to the third batch, index = (1, 0) + [4, 3, 2]]] <-- this is applied to the fourth batch, index = (1, 1) +indices_shape = (2, 2, 3) + +data = [[[1, 2, 3, 4, 5], <-- the first batch, index = (0, 0) + [6, 7, 8, 9, 10]], <-- the second batch, index = (0, 1) + + [[11, 12, 13, 14, 15], <-- the third batch, index = (1, 0) + [16, 17, 18, 19, 20]]] <-- the fourth batch, index = (1, 1) +data_shape = (2, 2, 5) + +output = [[[ 1, 1, 5], + [10, 6, 6]], + + [[12, 13, 15], + [20, 19, 18]]] +output_shape = (2, 2, 3) +``` +Example 4 with *axis* > *batch_dims*: +``` +batch_dims = 1 +axis = 2 + +indices = [[1, 2, 4], <-- this is applied to the first batch + [4, 3, 2]] <-- this is applied to the second batch +indices_shape = (2, 3) + +data = [[[[ 1, 2, 3, 4], <-- first batch + [ 5, 6, 7, 8], + [ 9, 10, 11, 12], + [13, 14, 15, 16], + [17, 18, 19, 20]]], + + [[[21, 22, 23, 24], <-- second batch + [25, 26, 27, 28], + [29, 30, 31, 32], + [33, 34, 35, 36], + [37, 38, 39, 40]]]] +data_shape = (2, 1, 5, 4) + +output = [[[[ 5, 6, 7, 8], + [ 9, 10, 11, 12], + [17, 18, 19, 20]]], + + [[[37, 38, 39, 40], + [33, 34, 35, 36], + [29, 30, 31, 32]]]] +output_shape = (2, 1, 3, 4) +``` + +Example 5 with negative *batch_dims* value: +``` +batch_dims = -1 <-- normalized value will be indices.rank + batch_dims = 2 - 1 = 1 +axis = 1 + +indices = [[0, 0, 4], <-- this is applied to the first batch + [4, 0, 0]] <-- this is applied to the second batch +indices_shape = (2, 3) + +data = [[1, 2, 3, 4, 5], <-- the first batch + [6, 7, 8, 9, 10]] <-- the second batch +data_shape = (2, 5) + +output = [[ 1, 1, 5], + [10, 6, 6]] +output_shape = (2, 3) +``` + +Example 6 with negative indices: +``` +batch_dims = 0 +axis = 0 + +indices = [0, -2, -1] +data = [1, 2, 3, 4, 5] +output = [1, 4, 5] +``` + +**Inputs** + +* **1**: `data` tensor of type *T* with arbitrary data. **Required**. + +* **2**: `indices` tensor of type *T_IND* with indices to gather. 0D tensor (scalar) for indices is also allowed. + The values for indices are in the range `[-data[axis], data[axis] - 1]`. + Negative values of indices indicate reverse indexing from `data[axis]`. + **Required**. + +* **3**: Scalar or 1D tensor `axis` of *T_AXIS* type is a dimension index to gather data from. For example, +*axis* equal to 1 means that gathering is performed over the first dimension. Negative `axis` means reverse indexing and + will be normalized to value `axis = data.rank + axis`. Allowed values are from `[-len(data.shape), len(data.shape) - 1]` + and `axis' >= batch_dims'`. Where `axis'` and `batch_dims'` stand for normalized `batch_dims` and `axis` values. +**Required**. 
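The gathering rule above maps neatly onto NumPy for the common cases. The sketch below reproduces Examples 1 and 2 from this page; it is an illustration of the semantics, not a reference implementation, and relies on `axis == batch_dims` holding in Example 2, where `np.take_along_axis` matches the described behaviour.

```python
import numpy as np

# Example 1: batch_dims = 0, axis = 0 -- plain indexing along one axis
data = np.array([1, 2, 3, 4, 5])
indices = np.array([0, 0, 4])
print(np.take(data, indices, axis=0))             # [1 1 5]

# Example 2: batch_dims = 1, axis = 1 -- each batch row uses its own index row
data = np.array([[1, 2, 3, 4, 5],
                 [6, 7, 8, 9, 10]])
indices = np.array([[0, 0, 4],
                    [4, 0, 0]])
print(np.take_along_axis(data, indices, axis=1))  # [[ 1  1  5]
                                                  #  [10  6  6]]
```

For the general combination of `batch_dims` and `axis` (Examples 3 and 4), the index arithmetic from the detailed description above applies and a plain `take_along_axis` call is no longer sufficient.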
+ +**Outputs** + +* **1**: The resulting tensor of type *T* that consists of elements from `data` tensor gathered by `indices`. The shape +of the output tensor is `data.shape[:axis] + indices.shape[batch_dims:] + data.shape[axis + 1:]` + +**Types** + +* *T*: any supported type. + +* *T_IND*: any supported integer types. + +* *T_AXIS*: any supported integer types. + +**Example** + +```xml + + + + + 2 + 64 + 128 + + + 2 + 32 + 21 + + + + + + 2 + 32 + 21 + 128 + + + +``` diff --git a/docs/ops/movement/ShuffleChannels_1.md b/docs/ops/movement/ShuffleChannels_1.md index ec7cfc75d9db6f..e97f3350d2a405 100644 --- a/docs/ops/movement/ShuffleChannels_1.md +++ b/docs/ops/movement/ShuffleChannels_1.md @@ -8,12 +8,37 @@ **Short description**: *ShuffleChannels* permutes data in the channel dimension of the input tensor. +**Detailed description**: + +Input tensor of `data_shape` is always interpreted as 4D tensor with the following shape: + + dim 0: data_shape[0] * data_shape[1] * ... * data_shape[axis-1] + (or 1 if axis == 0) + dim 1: group + dim 2: data_shape[axis] / group + dim 3: data_shape[axis+1] * data_shape[axis+2] * ... * data_shape[data_shape.size()-1] + (or 1 if axis points to last dimension) + + +Trailing and leading to `axis` dimensions are flattened and reshaped back to the original shape after channels shuffling. + + +The operation is equivalent to the following transformation of the input tensor `x` of shape `[N, C, H, W]` and `axis = 1`: + +\f[ +x' = reshape(x, [N, group, C / group, H * W])\\ +x'' = transpose(x', [0, 2, 1, 3])\\ +y = reshape(x'', [N, C, H, W])\\ +\f] + +where `group` is the layer attribute described below. + **Attributes**: * *axis* * **Description**: *axis* specifies the index of a channel dimension. - * **Range of values**: an integer number in the range [-4, 3] + * **Range of values**: an integer number in the range `[-rank(data_shape), rank(data_shape) - 1]` * **Type**: `int` * **Default value**: 1 * **Required**: *No* @@ -21,30 +46,22 @@ * *group* * **Description**: *group* specifies the number of groups to split the channel dimension into. This number must evenly divide the channel dimension size. - * **Range of values**: a positive integer + * **Range of values**: a positive integer in the range `[1, data_shape[axis]]` * **Type**: `int` * **Default value**: 1 * **Required**: *No* **Inputs**: -* **1**: 4D input tensor of any supported data type. Required. +* **1**: `data` input tensor of type *T* and rank greater or equal to 1. **Required.** **Outputs**: -* **1**: 4D input tensor with shape and element type as for the input tensor. +* **1**: Output tensor with element type *T* and same shape as the input tensor. -**Mathematical Formulation** +**Types** -The operation is the equivalent with the following transformation of the input tensor *x* of shape *[N, C, H, W]*: - -``` -x' = reshape(x, [N, group, C / group, H * W]) -x'' = transpose(x', [0, 2, 1, 3]) -y = reshape(x'', [N, C, H, W]) -``` - -where `group` is the layer parameter described above and the `axis = 1`. +* *T*: any supported numeric type. **Example** @@ -68,4 +85,4 @@ where `group` is the layer parameter described above and the `axis = 1`. 
-``` \ No newline at end of file +``` diff --git a/docs/ops/movement/SpaceToBatch_2.md b/docs/ops/movement/SpaceToBatch_2.md index 66c064e27bee35..eea9df8ced1acf 100644 --- a/docs/ops/movement/SpaceToBatch_2.md +++ b/docs/ops/movement/SpaceToBatch_2.md @@ -8,20 +8,20 @@ **Detailed description**: -The *SpaceToBatch* operation is similar to the TensorFlow* operation [SpaceToBatchND](https://www.tensorflow.org/api_docs/python/tf/space_to_batch_nd) - The operation is equivalent to the following transformation of the input tensor `data` of shape `[batch, D_1, D_2 ... D_{N - 1}]` and `block_shape`, `pads_begin`, `pads_end` of shapes `[N]` to *Y* output tensor. - Zero-pad the start and end of dimensions [D_0, ..., D_{N - 1}] of the input according to `pads_begin` and `pads_end`: - note: P_0 for batch dimension is expected to be 0 (no-padding). - x = [batch + P_0, D_1 + P_1, D_2 + P_2, ..., D_{N - 1} + P_{N - 1}], where P_i = pads_begin[i] + pads_end[i] - - note: B_0 for batch is ignored. - x' = reshape(x, [batch, (D_1 + P_1) / B_1, B_1, (D_2 + P_2) / B_2, B_2, ..., (D_{N - 1} + P_{N - 1}) / B_{N - 1}, B_{N - 1}]), where B_i = block_shape[i] +Zero-pad the start and end of dimensions \f$[D_0, \dots, D_{N - 1}]\f$ of the input according to `pads_begin` and `pads_end`: - x'' = transpose(x', [2, 4, ..., (N - 1) + (N - 1), 0, 1, 3, ..., N + (N - 1)]) +\f[x = [batch + P_0, D_1 + P_1, D_2 + P_2, \dots, D_{N - 1} + P_{N - 1}]\f] +\f[x' = reshape(x, [batch, \frac{D_1 + P_1}{B_1}, B_1, \frac{D_2 + P_2}{B_2}, B_2, \dots, \frac{D_{N - 1} + P_{N - 1}}{B_{N - 1}}, B_{N - 1}])\f] +\f[x'' = transpose(x', [2, 4, \dots, (N - 1) + (N - 1), 0, 1, 3, \dots, N + (N - 1)])\f] +\f[y = reshape(x'', [batch \times B_1 \times \dots \times B_{N - 1}, \frac{D_1 + P_1}{B_1}, \frac{D_2 + P_2}{B_2}, \dots, \frac{D_{N - 1} + P_{N - 1}}{B_{N - 1}}]\f] - y = reshape(x'', [batch * B_1 * ... * B_{N - 1}, (D_1 + P_1) / B_1, (D_2 + P_2) / B_2, ... , (D_{N - 1} + P_{N - 1}) / B_{N - 1}]) +where +- \f$P_i\f$ = pads_begin[i] + pads_end[i] +- \f$B_i\f$ = block_shape[i] +- \f$P_0\f$ for batch dimension is expected to be 0 (no-padding) +- \f$B_0\f$ for batch is ignored **Attributes** @@ -36,7 +36,7 @@ The operation is equivalent to the following transformation of the input tensor **Outputs** -* **1**: N-D tensor with shape `[batch * block_shape[0] * block_shape[1] * ... * block_shape[N - 1], (pads_begin[1] + D_1 + pads_end[1]) / block_shape[1], (pads_begin[2] + D_2 + pads_end[2]) / block_shape[2], ..., (pads_begin[N - 1] + D_{N - 1} + pads_end[N - 1]) / block_shape[N - 1]` of the same type as `data` input. +* **1**: N-D tensor with shape `[batch * block_shape[0] * block_shape[1] * ... * block_shape[N - 1], (D_1 + pads_begin[1] + pads_end[1]) / block_shape[1], (D_2 + pads_begin[2] + pads_end[2]) / block_shape[2], ..., (D_{N -1} + pads_begin[N - 1] + pads_end[N - 1]) / block_shape[N - 1]` of the same type as `data` input. **Types** diff --git a/docs/ops/movement/StridedSlice_1.md b/docs/ops/movement/StridedSlice_1.md index 6c07665d8f930c..41742e20652b13 100644 --- a/docs/ops/movement/StridedSlice_1.md +++ b/docs/ops/movement/StridedSlice_1.md @@ -4,14 +4,13 @@ **Category**: Data movement operation -**Short description**: *StridedSlice* extracts a strided slice of a tensor. - It is similar to generalized array indexing in Python\*. +**Short description**: *StridedSlice* extracts a strided slice of a tensor. **Attributes** * *begin_mask* - * **Description**: *begin_mask* is a bit mask. 
*begin_mask[i]* equal to 1 means that the corresponding dimension of the `begin` input is ignored and the 'real' beginning of the tensor is used along corresponding dimension. + * **Description**: *begin_mask* is a bit mask. *begin_mask[i]* equal to `1` means that the corresponding dimension of the `begin` input is ignored and the 'real' beginning of the tensor is used along corresponding dimension. * **Range of values**: a list of `0`s and `1`s * **Type**: `int[]` * **Default value**: None @@ -19,7 +18,7 @@ * *end_mask* - * **Description**: *end_mask* is a bit mask. If *end_mask[i]* is 1, the corresponding dimension of the `end` input is ignored and the real 'end' of the tensor is used along corresponding dimension. + * **Description**: *end_mask* is a bit mask. If *end_mask[i]* is `1`, the corresponding dimension of the `end` input is ignored and the real 'end' of the tensor is used along corresponding dimension. * **Range of values**: a list of `0`s and `1`s * **Type**: `int[]` * **Default value**: None @@ -27,7 +26,7 @@ * *new_axis_mask* - * **Description**: *new_axis_mask* is a bit mask. If *new_axis_mask[i]* is 1, a length 1 dimension is inserted on the `i`-th position of input tensor. + * **Description**: *new_axis_mask* is a bit mask. If *new_axis_mask[i]* is `1`, a length 1 dimension is inserted on the `i`-th position of input tensor. * **Range of values**: a list of `0`s and `1`s * **Type**: `int[]` * **Default value**: `[0]` @@ -35,7 +34,7 @@ * *shrink_axis_mask* - * **Description**: *shrink_axis_mask* is a bit mask. If *shrink_axis_mask[i]* is 1, the dimension on the `i`-th position is deleted. + * **Description**: *shrink_axis_mask* is a bit mask. If *shrink_axis_mask[i]* is `1`, the dimension on the `i`-th position is deleted. * **Range of values**: a list of `0`s and `1`s * **Type**: `int[]` * **Default value**: `[0]` @@ -51,21 +50,83 @@ **Inputs**: -* **1**: Multidimensional input tensor to be sliced. Required. +* **1**: `data` - input tensor to be sliced of type `T` and arbitrary shape. **Required.** -* **2**: `begin` input - 1D input tensor with begin indexes for input tensor slicing. Required. - Out-of-bounds values are silently clamped. If `begin_mask[i]` is 1, the value of `begin[i]` is ignored - and the range of the appropriate dimension starts from 0. - Negative values mean indexing starts from the end. For example, if `foo=[1,2,3]`, `begin[0]=-1` means `begin[0]=3`. +* **2**: `begin` - 1D tensor of type `T_IND` with begin indexes for input tensor slicing. **Required.** + Out-of-bounds values are silently clamped. If `begin_mask[i]` is `1`, the value of `begin[i]` is ignored and the range of the appropriate dimension starts from `0`. Negative values mean indexing starts from the end. For example, if `data=[1,2,3]`, `begin[0]=-1` means `begin[0]=3`. -* **3**: `end` input - 1D input tensor with end indexes for input tensor slicing. Required. - Out-of-bounds values will be silently clamped. If `end_mask[i]` is 1, the value of `end[i]` is ignored - and the full range of the appropriate dimension is used instead. - Negative values mean indexing starts from the end. For example, if `foo=[1,2,3]`, `end[0]=-1` means `end[0]=3`. +* **3**: `end` - 1D tensor of type `T_IND` with end indexes for input tensor slicing. **Required.** + Out-of-bounds values will be silently clamped. If `end_mask[i]` is `1`, the value of `end[i]` is ignored and the full range of the appropriate dimension is used instead. Negative values mean indexing starts from the end. 
For example, if `data=[1,2,3]`, `end[0]=-1` means `end[0]=3`. -* **4**: `stride` input - 1D input tensor with strides. Optional. +* **4**: `stride` - 1D tensor of type `T_IND` with strides. **Optional.** -**Example** +**Types** +* *T*: any supported type. +* *T_IND*: any supported integer type. + +**Example** +Example of `begin_mask` & `end_mask` usage. +```xml + + + + + 2 + 3 + 4 + + + 2 + + + 2 + + + 2 + + + + + 1 + 3 + 2 + + + +``` + +Example of `new_axis_mask` usage. +```xml + + + + + 2 + 3 + 4 + + + 2 + + + 2 + + + 2 + + + + + 1 + 2 + 3 + 4 + + + +``` + +Example of `shrink_axis_mask` usage. ```xml @@ -96,4 +157,4 @@ -``` \ No newline at end of file +``` diff --git a/docs/ops/opset8.md b/docs/ops/opset8.md new file mode 100644 index 00000000000000..02e97eab4e42f6 --- /dev/null +++ b/docs/ops/opset8.md @@ -0,0 +1,169 @@ +# Operation Set `opset8` Specification {#openvino_docs_ops_opset8} + +This specification document describes the `opset8` operation set supported in OpenVINO™. +Support for each particular operation from the list below depends on the capabilities of an inference plugin +and may vary among different hardware platforms and devices. Examples of operation instances are provided as IR V10 xml +snippets. Such IR is generated by the Model Optimizer. The semantics match corresponding nGraph operation classes +declared in `namespace opset8`. + + +## Table of Contents + +* [Abs](arithmetic/Abs_1.md) +* [Acos](arithmetic/Acos_1.md) +* [Acosh](arithmetic/Acosh_3.md) +* [AdaptiveAvgPool](pooling/AdaptiveAvgPool_8.md) +* [AdaptiveMaxPool](pooling/AdaptiveMaxPool_8.md) +* [Add](arithmetic/Add_1.md) +* [Asin](arithmetic/Asin_1.md) +* [Asinh](arithmetic/Asinh_3.md) +* [Assign](infrastructure/Assign_3.md) +* [Atan](arithmetic/Atan_1.md) +* [Atanh](arithmetic/Atanh_3.md) +* [AvgPool](pooling/AvgPool_1.md) +* [BatchNormInference](normalization/BatchNormInference_5.md) +* [BatchToSpace](movement/BatchToSpace_2.md) +* [BinaryConvolution](convolution/BinaryConvolution_1.md) +* [Broadcast](movement/Broadcast_3.md) +* [Bucketize](condition/Bucketize_3.md) +* [CTCGreedyDecoder](sequence/CTCGreedyDecoder_1.md) +* [CTCGreedyDecoderSeqLen](sequence/CTCGreedyDecoderSeqLen_6.md) +* [CTCLoss](sequence/CTCLoss_4.md) +* [Ceiling](arithmetic/Ceiling_1.md) +* [Clamp](activation/Clamp_1.md) +* [Concat](movement/Concat_1.md) +* [Constant](infrastructure/Constant_1.md) +* [Convert](type/Convert_1.md) +* [ConvertLike](type/ConvertLike_1.md) +* [Convolution](convolution/Convolution_1.md) +* [ConvolutionBackpropData](convolution/ConvolutionBackpropData_1.md) +* [Cos](arithmetic/Cos_1.md) +* [Cosh](arithmetic/Cosh_1.md) +* [CumSum](arithmetic/CumSum_3.md) +* [DeformableConvolution](convolution/DeformableConvolution_8.md) +* [DeformablePSROIPooling](detection/DeformablePSROIPooling_1.md) +* [DepthToSpace](movement/DepthToSpace_1.md) +* [DetectionOutput](detection/DetectionOutput_1.md) +* [DFT](signals/DFT_7.md) +* [Divide](arithmetic/Divide_1.md) +* [Einsum](matrix/Einsum_7.md) +* [Elu](activation/Elu_1.md) +* [EmbeddingBagOffsetsSum](sparse/EmbeddingBagOffsetsSum_3.md) +* [EmbeddingBagPackedSum](sparse/EmbeddingBagPackedSum_3.md) +* [EmbeddingSegmentsSum](sparse/EmbeddingSegmentsSum_3.md) +* [Equal](comparison/Equal_1.md) +* [Erf](arithmetic/Erf_1.md) +* [Exp](activation/Exp_1.md) +* [ExperimentalDetectronDetectionOutput_6](detection/ExperimentalDetectronDetectionOutput_6.md) +* [ExperimentalDetectronGenerateProposalsSingleImage_6](detection/ExperimentalDetectronGenerateProposalsSingleImage_6.md) +* 
[ExperimentalDetectronPriorGridGenerator_6](detection/ExperimentalDetectronPriorGridGenerator_6.md) +* [ExperimentalDetectronROIFeatureExtractor_6](detection/ExperimentalDetectronROIFeatureExtractor_6.md) +* [ExperimentalDetectronTopKROIs_6](sort/ExperimentalDetectronTopKROIs_6.md) +* [ExtractImagePatches](movement/ExtractImagePatches_3.md) +* [FakeQuantize](quantization/FakeQuantize_1.md) +* [Floor](arithmetic/Floor_1.md) +* [FloorMod](arithmetic/FloorMod_1.md) +* [Gather](movement/Gather_8.md) +* [GatherElements](movement/GatherElements_6.md) +* [GatherND_5](movement/GatherND_5.md) +* [GatherTree](movement/GatherTree_1.md) +* [Gelu](activation/GELU_7.md) +* [Greater](comparison/Greater_1.md) +* [GreaterEqual](comparison/GreaterEqual_1.md) +* [GRN](normalization/GRN_1.md) +* [GroupConvolution](convolution/GroupConvolution_1.md) +* [GroupConvolutionBackpropData](convolution/GroupConvolutionBackpropData_1.md) +* [GRUCell](sequence/GRUCell_3.md) +* [GRUSequence](sequence/GRUSequence_5.md) +* [HardSigmoid](activation/HardSigmoid_1.md) +* [HSigmoid](activation/HSigmoid_5.md) +* [HSwish](activation/HSwish_4.md) +* [IDFT](signals/IDFT_7.md) +* [Interpolate](image/Interpolate_4.md) +* [Less](comparison/Less_1.md) +* [LessEqual](comparison/LessEqual_1.md) +* [Log](arithmetic/Log_1.md) +* [LogicalAnd](logical/LogicalAnd_1.md) +* [LogicalNot](logical/LogicalNot_1.md) +* [LogicalOr](logical/LogicalOr_1.md) +* [LogicalXor](logical/LogicalXor_1.md) +* [LogSoftmax](activation/LogSoftmax_5.md) +* [Loop](infrastructure/Loop_5.md) +* [LRN](normalization/LRN_1.md) +* [LSTMCell](sequence/LSTMCell_1.md) +* [LSTMSequence](sequence/LSTMSequence_1.md) +* [MatMul](matrix/MatMul_1.md) +* [MaxPool](pooling/MaxPool_1.md) +* [Maximum](arithmetic/Maximum_1.md) +* [Minimum](arithmetic/Minimum_1.md) +* [Mish](activation/Mish_4.md) +* [Mod](arithmetic/Mod_1.md) +* [MVN](normalization/MVN_6.md) +* [Multiply](arithmetic/Multiply_1.md) +* [Negative](arithmetic/Negative_1.md) +* [NonMaxSuppression](sort/NonMaxSuppression_5.md) +* [NonZero](condition/NonZero_3.md) +* [NormalizeL2](normalization/NormalizeL2_1.md) +* [NotEqual](comparison/NotEqual_1.md) +* [OneHot](sequence/OneHot_1.md) +* [Pad](movement/Pad_1.md) +* [Parameter](infrastructure/Parameter_1.md) +* [Power](arithmetic/Power_1.md) +* [PReLU](activation/PReLU_1.md) +* [PriorBoxClustered](detection/PriorBoxClustered_1.md) +* [PriorBox](detection/PriorBox_1.md) +* [Proposal](detection/Proposal_4.md) +* [PSROIPooling](detection/PSROIPooling_1.md) +* [Range](generation/Range_4.md) +* [ReLU](activation/ReLU_1.md) +* [ReadValue](infrastructure/ReadValue_3.md) +* [ReduceL1](reduction/ReduceL1_4.md) +* [ReduceL2](reduction/ReduceL2_4.md) +* [ReduceLogicalAnd](reduction/ReduceLogicalAnd_1.md) +* [ReduceLogicalOr](reduction/ReduceLogicalOr_1.md) +* [ReduceMax](reduction/ReduceMax_1.md) +* [ReduceMean](reduction/ReduceMean_1.md) +* [ReduceMin](reduction/ReduceMin_1.md) +* [ReduceProd](reduction/ReduceProd_1.md) +* [ReduceSum](reduction/ReduceSum_1.md) +* [RegionYolo](detection/RegionYolo_1.md) +* [ReorgYolo](detection/ReorgYolo_1.md) +* [Reshape](shape/Reshape_1.md) +* [Result](infrastructure/Result_1.md) +* [ReverseSequence](movement/ReverseSequence_1.md) +* [RNNCell](sequence/RNNCell_3.md) +* [RNNSequence](sequence/RNNSequence_5.md) +* [ROIAlign](detection/ROIAlign_3.md) +* [ROIPooling](detection/ROIPooling_1.md) +* [Roll](movement/Roll_7.md) +* [Round](arithmetic/Round_5.md) +* [ScatterElementsUpdate](movement/ScatterElementsUpdate_3.md) +* 
[ScatterNDUpdate](movement/ScatterNDUpdate_3.md) +* [ScatterUpdate](movement/ScatterUpdate_3.md) +* [Select](condition/Select_1.md) +* [Selu](activation/Selu_1.md) +* [ShapeOf](shape/ShapeOf_3.md) +* [ShuffleChannels](movement/ShuffleChannels_1.md) +* [Sigmoid](activation/Sigmoid_1.md) +* [Sign](arithmetic/Sign_1.md) +* [Sin](arithmetic/Sin_1.md) +* [Sinh](arithmetic/Sinh_1.md) +* [SoftMax](activation/SoftMax_1.md) +* [SoftPlus](activation/SoftPlus_4.md) +* [SpaceToBatch](movement/SpaceToBatch_2.md) +* [SpaceToDepth](movement/SpaceToDepth_1.md) +* [Split](movement/Split_1.md) +* [Sqrt](arithmetic/Sqrt_1.md) +* [SquaredDifference](arithmetic/SquaredDifference_1.md) +* [Squeeze](shape/Squeeze_1.md) +* [StridedSlice](movement/StridedSlice_1.md) +* [Subtract](arithmetic/Subtract_1.md) +* [Swish](activation/Swish_4.md) +* [Tan](arithmetic/Tan_1.md) +* [Tanh](arithmetic/Tanh_1.md) +* [TensorIterator](infrastructure/TensorIterator_1.md) +* [Tile](movement/Tile_1.md) +* [TopK](sort/TopK_3.md) +* [Transpose](movement/Transpose_1.md) +* [Unsqueeze](shape/Unsqueeze_1.md) +* [VariadicSplit](movement/VariadicSplit_1.md) diff --git a/docs/ops/pooling/AdaptiveAvgPool_8.md b/docs/ops/pooling/AdaptiveAvgPool_8.md new file mode 100644 index 00000000000000..beb2ec30492d3e --- /dev/null +++ b/docs/ops/pooling/AdaptiveAvgPool_8.md @@ -0,0 +1,70 @@ +## AdaptiveAvgPool {#openvino_docs_ops_pooling_AdaptiveAvgPool_8} + +**Versioned name**: *AdaptiveAvgPool-8* + +**Category**: *Pooling* + +**Short description**: Applies average pooling with adaptive kernel size over the input. + +**Detailed description**: This operation calculates the output based on the first input and `output_size` determined by the second input. +The kernel dimensions are calculated using the following formulae for the `NCDHW` input case: + +\f[ +\begin{array}{lcl} +d_{start} &=& floor(i*D_{in}/D_{out})\\ +d_{end} &=& ceil((i+1)*D_{in}/D_{out})\\ +h_{start} &=& floor(j*H_{in}/H_{out})\\ +h_{end} &=& ceil((j+1)*H_{in}/H_{out})\\ +w_{start} &=& floor(k*W_{in}/W_{out})\\ +w_{end} &=& ceil((k+1)*W_{in}/W_{out}) +\end{array} +\f] + +The output is calculated with the following formula: + +\f[ +Output(i,j,k) = \frac{Input[d_{start}:d_{end}, h_{start}:h_{end}, w_{start}:w_{end}]}{(d_{end}-d_{start})*(h_{end}-h_{start})*(w_{end}-w_{start})} +\f] + +**Inputs**: + +* **1**: 3D, 4D, or 5D input tensor of shape `[N, C, H]`, `[N, C, H, W]` or `[N, C, D, H, W]` and type *T*. Required. +* **2**: 1D tensor describing output shape for spatial dimensions. Can be `[H_out]` for 3D input, `[H_out, W_out]` for 4D input, `[D_out, H_out, W_out]` for 5D input and of type *T_SHAPE*. Required. + +**Outputs**: + +* **1**: Output of type *T* and shape `[N, C, H_out]`, `[N, C, H_out, W_out]` or `[N, C, D_out, H_out, W_out]`. + +**Types** + +* *T*: floating-point type. +* *T_SHAPE*: `int32` or `int64`. + +**Examples** + +```xml + + + + + 1 + 3 + 32 + 32 + + + + + 2 + + + + + 1 + 3 + 16 + 16 + + + +``` diff --git a/docs/ops/pooling/AdaptiveMaxPool_8.md b/docs/ops/pooling/AdaptiveMaxPool_8.md new file mode 100644 index 00000000000000..d7ad9a42412bd9 --- /dev/null +++ b/docs/ops/pooling/AdaptiveMaxPool_8.md @@ -0,0 +1,87 @@ +## AdaptiveMaxPool {#openvino_docs_ops_pooling_AdaptiveMaxPool_8} + +**Versioned name**: *AdaptiveMaxPool-8* + +**Category**: *Pooling* + +**Short description**: Applies max pooling with adaptive kernel size over the input. 
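The adaptive kernel windows used by *AdaptiveAvgPool* and *AdaptiveMaxPool* can be illustrated with a minimal NumPy sketch of the floor/ceil window computation given in the detailed description below. The `adaptive_pool_2d` helper is hypothetical and covers only a 4D `NCHW` input; it is not the plugin implementation:

```python
import math
import numpy as np

def adaptive_pool_2d(x, output_size, reduce=np.mean):
    # Pass reduce=np.mean for AdaptiveAvgPool or reduce=np.max for AdaptiveMaxPool.
    n, c, h_in, w_in = x.shape
    h_out, w_out = output_size
    y = np.zeros((n, c, h_out, w_out), dtype=np.float32)
    for j in range(h_out):
        # Window boundaries follow the floor/ceil formulae from the spec.
        h0, h1 = (j * h_in) // h_out, math.ceil((j + 1) * h_in / h_out)
        for k in range(w_out):
            w0, w1 = (k * w_in) // w_out, math.ceil((k + 1) * w_in / w_out)
            y[:, :, j, k] = reduce(x[:, :, h0:h1, w0:w1], axis=(2, 3))
    return y
```

*AdaptiveMaxPool* additionally returns the positions of the selected maxima, as indices into the flattened input, in its second output.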
+ +**Detailed description**: This operation calculates the output based on the first input and `output_size` determined by the second input. +The kernel dimensions are calculated using the following formulae for the `NCDHW` input case: + +\f[ +\begin{array}{lcl} +d_{start} &=& floor(i*D_{in}/D_{out})\\ +d_{end} &=& ceil((i+1)*D_{in}/D_{out})\\ +h_{start} &=& floor(j*H_{in}/H_{out})\\ +h_{end} &=& ceil((j+1)*H_{in}/H_{out})\\ +w_{start} &=& floor(k*W_{in}/W_{out})\\ +w_{end} &=& ceil((k+1)*W_{in}/W_{out}) +\end{array} +\f] + +The output is calculated following this formula: + +\f[ +Output(i,j,k) = max(Input[d_{start}:d_{end}, h_{start}:h_{end}, w_{start}:w_{end}]) +\f] + +**Attributes**: + +* *index_element_type* + + * **Description**: the type of the second output containing indices + * **Range of values**: "i64" or "i32" + * **Type**: string + * **Default value**: "i64" + * **Required**: *No* + +**Inputs**: + +* **1**: 3D, 4D, or 5D input tensor of shape `[N, C, H]`, `[N, C, H, W]` or `[N, C, D, H, W]` and type *T*. Required. +* **2**: 1D tensor describing output shape for spatial dimensions. Can be `[H_out]` for 3D input, `[H_out, W_out]` for 4D input, `[D_out, H_out, W_out]` for 5D input and of type *T_SHAPE*. Required. + +**Outputs**: + +* **1**: Output of type *T* and shape `[N, C, H_out]`, `[N, C, H_out, W_out]` or `[N, C, D_out, H_out, W_out]`. +* **2**: Output of type specified by *index_element_type* and same shape as the first output containing indices of elements in the first output. The values of indices are computed as if input was flatten 1-D tensor, so the values are in the range `[0, N * C * H * W * D)`. + +**Types** + +* *T*: floating-point type. +* *T_SHAPE*: `int32` or `int64`. + +**Examples** + +```xml + + + + + 1 + 3 + 32 + 32 + + + + + 2 + + + + + 1 + 3 + 16 + 16 + + + 1 + 3 + 16 + 16 + + + +``` diff --git a/docs/ops/sort/MatrixNMS_8.md b/docs/ops/sort/MatrixNMS_8.md new file mode 100644 index 00000000000000..b578822d6ebd81 --- /dev/null +++ b/docs/ops/sort/MatrixNMS_8.md @@ -0,0 +1,178 @@ +## MatrixNonMaxSuppression {#openvino_docs_ops_sort_MatrixNms_8} + +**Versioned name**: *MatrixNonMaxSuppression-8* + +**Category**: *Sorting and maximization* + +**Short description**: *MatrixNonMaxSuppression* performs matrix non-maximum suppression (NMS) of the boxes with predicted scores. + +**Detailed description**: The operation performs the following: + +1. Selects candidate bounding boxes with scores higher than `score_threshold`. +2. For each class, selects at most `nms_top_k` candidate boxes. +3. Decays scores of the candidate boxes according to the Matrix NMS algorithm [Wang et al](https://arxiv.org/abs/2003.10152.pdf). This algorithm is applied independently to each class and each batch element. Boxes of `background_class` are skipped and thus eliminated during the process. +4. Selects boxes with the decayed scores higher than `post_threshold`, and selects at most `keep_top_k` scoring candidate boxes per batch element. + +The Matrix NMS algorithm is described below: +1. Sort descending the candidate boxes by score, and compute `n*n` pairwise IOU (IntersectionOverUnion) matrix `X` for the top `n` boxes. Suppose `n` is the number of candidate boxes. +2. Set the lower triangle and diagonal of `X` to 0. Therefore get the upper triangular matrix `X`. +3. Take the column-wise max of `X` to compute a vector `K` of maximum IOU for each candidate box. +4. Repeat element value of `K` along axis 1. Suppose this gets a matrix `X_cmax`. +5. 
Compute the decay factor: `decay_factor = exp((X_cmax**2 - X**2) * gaussian_sigma)` if `decay_function` is `gaussian`, else `decay_factor = (1 - X) / (1 - X_cmax)`. +6. Take the column-wise min of `decay_factor`, and element-wise multiply with scores to decay them. + +**Attributes**: + +* *sort_result* + + * **Description**: *sort_result* specifies the order of output elements. + * **Range of values**: `class`, `score`, `none` + * *class* - sort selected boxes by class id (ascending). + * *score* - sort selected boxes by score (descending). + * *none* - do not guarantee the order. + * **Type**: `string` + * **Default value**: `none` + * **Required**: *No* + +* *sort_result_across_batch* + + * **Description**: *sort_result_across_batch* is a flag that specifies whether selected boxes should be sorted across batches or not. + * **Range of values**: true or false + * *true* - sort selected boxes across batches. + * *false* - do not sort selected boxes across batches (boxes are sorted per batch element). + * **Type**: boolean + * **Default value**: false + * **Required**: *No* + +* *output_type* + + * **Description**: the tensor type of outputs `selected_indices` and `selected_num`. + * **Range of values**: `i64` or `i32` + * **Type**: `string` + * **Default value**: `i64` + * **Required**: *No* + +* *score_threshold* + + * **Description**: minimum score to consider a box for processing. + * **Range of values**: a floating-point number + * **Type**: `float` + * **Default value**: `0` + * **Required**: *No* + +* *nms_top_k* + + * **Description**: maximum number of boxes to be selected per class. + * **Range of values**: an integer + * **Type**: `int` + * **Default value**: `-1` meaning to keep all boxes + * **Required**: *No* + +* *keep_top_k* + + * **Description**: maximum number of boxes to be selected per batch element. + * **Range of values**: an integer + * **Type**: `int` + * **Default value**: `-1` meaning to keep all boxes + * **Required**: *No* + +* *background_class* + + * **Description**: the background class id. + * **Range of values**: an integer + * **Type**: `int` + * **Default value**: `-1` meaning to keep all classes + * **Required**: *No* + +* *normalized* + + * **Description**: *normalized* is a flag that indicates whether `boxes` are normalized or not. + * **Range of values**: true or false + * *true* - the box coordinates are normalized. + * *false* - the box coordinates are not normalized. + * **Type**: boolean + * **Default value**: true + * **Required**: *No* + +* *decay_function* + + * **Description**: decay function used to decay scores. + * **Range of values**: `gaussian`, `linear` + * **Type**: `string` + * **Default value**: `linear` + * **Required**: *No* + +* *gaussian_sigma* + + * **Description**: the sigma parameter for the `gaussian` *decay_function*. + * **Range of values**: a floating-point number + * **Type**: `float` + * **Default value**: `2.0` + * **Required**: *No* + +* *post_threshold* + + * **Description**: threshold to filter out boxes with low confidence score after decaying. + * **Range of values**: a floating-point number + * **Type**: `float` + * **Default value**: `0` + * **Required**: *No* + +**Inputs**: + +* **1**: `boxes` - tensor of type *T* and shape `[num_batches, num_boxes, 4]` with box coordinates. The box coordinates are laid out as `[xmin, ymin, xmax, ymax]`. **Required.** + +* **2**: `scores` - tensor of type *T* and shape `[num_batches, num_classes, num_boxes]` with box scores.
**Required.** + +**Outputs**: + +* **1**: `selected_outputs` - tensor of type *T_THRESHOLDS* and shape `[number of selected boxes, 6]` containing the selected boxes with score and class as tuples `[class_id, box_score, xmin, ymin, xmax, ymax]`. + +* **2**: `selected_indices` - tensor of type *T_IND* and shape `[number of selected boxes, 1]` the selected indices in the flattened input `boxes`, which are absolute values cross batches. Therefore possible valid values are in the range `[0, num_batches * num_boxes - 1]`. + +* **3**: `selected_num` - 1D tensor of type *T_IND* and shape `[num_batches]` representing the number of selected boxes for each batch element. + +When there is no box selected, `selected_num` is filled with `0`. `selected_outputs` is an empty tensor of shape `[0, 6]`, and `selected_indices` is an empty tensor of shape `[0, 1]`. + +**Types** + +* *T*: floating point type. + +* *T_MAX_BOXES*: integer type. + +* *T_THRESHOLDS*: floating point type. + +* *T_IND*: `int64` or `int32`. + +**Example** + +```xml + + + + + 3 + 100 + 4 + + + 3 + 5 + 100 + + + + + -1 + 6 + + + -1 + 1 + + + 3 + + + +``` diff --git a/docs/ops/sort/MulticlassNMS_8.md b/docs/ops/sort/MulticlassNMS_8.md new file mode 100644 index 00000000000000..866254963fdfa8 --- /dev/null +++ b/docs/ops/sort/MulticlassNMS_8.md @@ -0,0 +1,171 @@ +## MulticlassNonMaxSuppression {#openvino_docs_ops_sort_MulticlassNonMaxSuppression_8} + +**Versioned name**: *MulticlassNonMaxSuppression-8* + +**Category**: *Sorting and maximization* + +**Short description**: *MulticlassNonMaxSuppression* performs multi-class non-maximum suppression of the boxes with predicted scores. + +**Detailed description**: *MulticlassNonMaxSuppression* is a multi-phase operation. It implements non-maximum suppression algorithm as described below: + +1. Let `B = [b_0,...,b_n]` be the list of initial detection boxes, `S = [s_0,...,s_N]` be the list of corresponding scores. +2. Let `D = []` be an initial collection of resulting boxes. Let `adaptive_threshold = iou_threshold`. +3. If `B` is empty, go to step 9. +4. Take the box with highest score. Suppose that it is the box `b` with the score `s`. +5. Delete `b` from `B`. +6. If the score `s` is greater than or equal to `score_threshold`, add `b` to `D`, else go to step 9. +7. If `nms_eta < 1` and `adaptive_threshold > 0.5`, update `adaptive_threshold *= nms_eta`. +8. For each input box `b_i` from `B` and the corresponding score `s_i`, set `s_i = 0` when `iou(b, b_i) > adaptive_threshold`, and go to step 3. +9. Return `D`, a collection of the corresponding scores `S`, and the number of elements in `D`. + +This algorithm is applied independently to each class of each batch element. The operation feeds at most `nms_top_k` scoring candidate boxes to this algorithm. +The total number of output boxes of each batch element must not exceed `keep_top_k`. +Boxes of `background_class` are skipped and thus eliminated. + +**Attributes**: + +* *sort_result* + + * **Description**: *sort_result* specifies the order of output elements. + * **Range of values**: `class`, `score`, `none` + * *class* - sort selected boxes by class id (ascending). + * *score* - sort selected boxes by score (descending). + * *none* - do not guarantee the order. + * **Type**: `string` + * **Default value**: `none` + * **Required**: *No* + +* *sort_result_across_batch* + + * **Description**: *sort_result_across_batch* is a flag that specifies whenever it is necessary to sort selected boxes across batches or not. 
+ * **Range of values**: true or false + * *true* - sort selected boxes across batches. + * *false* - do not sort selected boxes across batches (boxes are sorted per batch element). + * **Type**: boolean + * **Default value**: false + * **Required**: *No* + +* *output_type* + + * **Description**: the tensor type of outputs `selected_indices` and `selected_num`. + * **Range of values**: `i64` or `i32` + * **Type**: `string` + * **Default value**: `i64` + * **Required**: *No* + +* *iou_threshold* + + * **Description**: intersection over union threshold. + * **Range of values**: a floating-point number + * **Type**: `float` + * **Default value**: `0` + * **Required**: *No* + +* *score_threshold* + + * **Description**: minimum score to consider a box for processing. + * **Range of values**: a floating-point number + * **Type**: `float` + * **Default value**: `0` + * **Required**: *No* + +* *nms_top_k* + + * **Description**: maximum number of boxes to be selected per class. + * **Range of values**: an integer + * **Type**: `int` + * **Default value**: `-1` meaning to keep all boxes + * **Required**: *No* + +* *keep_top_k* + + * **Description**: maximum number of boxes to be selected per batch element. + * **Range of values**: an integer + * **Type**: `int` + * **Default value**: `-1` meaning to keep all boxes + * **Required**: *No* + +* *background_class* + + * **Description**: the background class id. + * **Range of values**: an integer + * **Type**: `int` + * **Default value**: `-1` meaning to keep all classes. + * **Required**: *No* + +* *normalized* + + * **Description**: *normalized* is a flag that indicates whether `boxes` are normalized or not. + * **Range of values**: true or false + * *true* - the box coordinates are normalized. + * *false* - the box coordinates are not normalized. + * **Type**: boolean + * **Default value**: true + * **Required**: *No* + +* *nms_eta* + + * **Description**: eta parameter for adaptive NMS. + * **Range of values**: a floating-point number in the closed range `[0, 1.0]`. + * **Type**: `float` + * **Default value**: `1.0` + * **Required**: *No* + +**Inputs**: + +* **1**: `boxes` - tensor of type *T* and shape `[num_batches, num_boxes, 4]` with box coordinates. The box coordinates are laid out as `[xmin, ymin, xmax, ymax]`. **Required.** + +* **2**: `scores` - tensor of type *T* and shape `[num_batches, num_classes, num_boxes]` with box scores. **Required.** + +**Outputs**: + +* **1**: `selected_outputs` - tensor of type *T_THRESHOLDS* and shape `[number of selected boxes, 6]` containing the selected boxes with score and class as tuples `[class_id, box_score, xmin, ymin, xmax, ymax]`. + +* **2**: `selected_indices` - tensor of type *T_IND* and shape `[number of selected boxes, 1]` with the selected indices in the flattened `boxes` input, which are absolute indices across batches. Therefore possible valid values are in the range `[0, num_batches * num_boxes - 1]`. + +* **3**: `selected_num` - 1D tensor of type *T_IND* and shape `[num_batches]` representing the number of selected boxes for each batch element. + +When there is no box selected, `selected_num` is filled with `0`. `selected_outputs` is an empty tensor of shape `[0, 6]`, and `selected_indices` is an empty tensor of shape `[0, 1]`. + +**Types** + +* *T*: floating point type. + +* *T_MAX_BOXES*: integer type. + +* *T_THRESHOLDS*: floating point type. + +* *T_IND*: `int64` or `int32`.
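The suppression loop from the detailed description above can be sketched for a single class of a single batch element as follows. This is a simplified illustration with hypothetical helper names; `nms_top_k`, `keep_top_k`, and `background_class` handling are omitted, and it is not the plugin implementation:

```python
def iou(a, b):
    # Intersection over union of two [xmin, ymin, xmax, ymax] boxes.
    iw = max(0.0, min(a[2], b[2]) - max(a[0], b[0]))
    ih = max(0.0, min(a[3], b[3]) - max(a[1], b[1]))
    inter = iw * ih
    union = (a[2] - a[0]) * (a[3] - a[1]) + (b[2] - b[0]) * (b[3] - b[1]) - inter
    return inter / union if union > 0.0 else 0.0

def nms_one_class(boxes, scores, iou_threshold, score_threshold, nms_eta=1.0):
    # Steps 1-9 of the algorithm above for one class of one batch element.
    scores = list(scores)
    remaining = list(range(len(boxes)))
    adaptive_threshold = iou_threshold
    selected = []
    while remaining:
        best = max(remaining, key=lambda i: scores[i])  # step 4: highest score left
        remaining.remove(best)                          # step 5
        if scores[best] < score_threshold:              # step 6: stop on low score
            break
        selected.append(best)
        if nms_eta < 1.0 and adaptive_threshold > 0.5:  # step 7: adapt the threshold
            adaptive_threshold *= nms_eta
        for i in remaining:                             # step 8: suppress overlaps
            if iou(boxes[best], boxes[i]) > adaptive_threshold:
                scores[i] = 0.0
    return selected
```

The operation applies this loop independently to each batch element and class, feeds it at most `nms_top_k` top-scoring candidates, and trims the combined per-batch result to `keep_top_k` boxes.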
+ +**Example** + +```xml + + + + + 3 + 100 + 4 + + + 3 + 5 + 100 + + + + + -1 + 6 + + + -1 + 1 + + + 3 + + + +``` diff --git a/docs/optimization_guide/dldt_optimization_guide.md b/docs/optimization_guide/dldt_optimization_guide.md index e70c0365a4165c..3d7b41e7f8ccef 100644 --- a/docs/optimization_guide/dldt_optimization_guide.md +++ b/docs/optimization_guide/dldt_optimization_guide.md @@ -2,13 +2,13 @@ ## Introduction -The purpose of this document is to give you performance-related insights to every step of the network deployment process. +The purpose of this document is to give you performance-related insights to every step of the network deployment process. For information on the general workflow, refer to the documentation in See Also. For an example Inference Engine API snippet, see Request-Based API and “GetBlob” Idiom. ### Deep Learning Inference Engine Overview -Deep Learning Inference Engine is a part of Intel® Deep Learning Deployment Toolkit (Intel® DL Deployment Toolkit) and OpenVINO™ toolkit. Inference Engine facilitates deployment of deep learning solutions by delivering a unified, device-agnostic API. +Deep Learning Inference Engine is a part of Intel® Deep Learning Deployment Toolkit (Intel® DL Deployment Toolkit) and OpenVINO™ toolkit. Inference Engine facilitates deployment of deep learning solutions by delivering a unified, device-agnostic API. Below, there are the three main steps of the deployment process: @@ -50,7 +50,7 @@ When evaluating performance of your model with the Inference Engine, you must me ### Latency vs. Throughput -In the asynchronous case (see Request-Based API and “GetBlob” Idiom), the performance of an individual infer request is usually of less concern. Instead, you typically execute multiple requests asynchronously and measure the throughput in images per second by dividing the number of images that were processed by the processing time. +In the asynchronous case (see Request-Based API and “GetBlob” Idiom), the performance of an individual infer request is usually of less concern. Instead, you typically execute multiple requests asynchronously and measure the throughput in images per second by dividing the number of images that were processed by the processing time. In contrast, for the latency-oriented tasks, the time to a single frame is more important. Refer to the [Benchmark App](../../inference-engine/samples/benchmark_app/README.md) sample, which allows latency vs. throughput measuring. @@ -114,23 +114,23 @@ The resulting IR precision, for instance, `FP16` or `FP32`, directly affects per ## Multi-Device Execution OpenVINO™ toolkit supports automatic multi-device execution, please see [MULTI-Device plugin description](../IE_DG/supported_plugins/MULTI.md). -In the next chapter you can find the device-specific tips, while this section covers few recommendations +In the next chapter you can find the device-specific tips, while this section covers few recommendations for the multi-device execution: -- MULTI usually performs best when the fastest device is specified first in the list of the devices. - This is particularly important when the parallelism is not sufficient +- MULTI usually performs best when the fastest device is specified first in the list of the devices. + This is particularly important when the parallelism is not sufficient (e.g. the number of request in the flight is not enough to saturate all devices). 
-- It is highly recommended to query the optimal number of inference requests directly from the instance of the ExecutionNetwork - (resulted from the LoadNetwork call with the specific multi-device configuration as a parameter). -Please refer to the code of the [Benchmark App](../../inference-engine/samples/benchmark_app/README.md) sample for details. -- Notice that for example CPU+GPU execution performs better with certain knobs +- It is highly recommended to query the optimal number of inference requests directly from the instance of the ExecutionNetwork + (resulted from the LoadNetwork call with the specific multi-device configuration as a parameter). +Please refer to the code of the [Benchmark App](../../inference-engine/samples/benchmark_app/README.md) sample for details. +- Notice that for example CPU+GPU execution performs better with certain knobs which you can find in the code of the same [Benchmark App](../../inference-engine/samples/benchmark_app/README.md) sample. - One specific example is disabling GPU driver polling, which in turn requires multiple GPU streams (which is already a default for the GPU) to amortize slower + One specific example is disabling GPU driver polling, which in turn requires multiple GPU streams (which is already a default for the GPU) to amortize slower inference completion from the device to the host. -- Multi-device logic always attempts to save on the (e.g. inputs) data copies between device-agnostic, user-facing inference requests - and device-specific 'worker' requests that are being actually scheduled behind the scene. - To facilitate the copy savings, it is recommended to start the requests in the order that they were created +- Multi-device logic always attempts to save on the (e.g. inputs) data copies between device-agnostic, user-facing inference requests + and device-specific 'worker' requests that are being actually scheduled behind the scene. + To facilitate the copy savings, it is recommended to start the requests in the order that they were created (with ExecutableNetwork's CreateInferRequest). - + ## Device-Specific Optimizations @@ -142,7 +142,7 @@ CPU plugin completely relies on the Intel® Math Kernel Library for Deep Neur The only hint you can get from that is how the major primitives are accelerated (and you cannot change this). For example, on the Core machines, you should see variations of the `jit_avx2` when inspecting the internal inference performance counters (and additional '_int8' postfix for [int8 inference](../IE_DG/Int8Inference.md)). If you are an advanced user, you can further trace the CPU execution with (see Intel® VTune™). -Internally, the Inference Engine has a threading abstraction level, which allows for compiling the [open source version](https://github.com/opencv/dldt) with either Intel® Threading Building Blocks (Intel® TBB) which is now default, or OpenMP* as an alternative parallelism solution. When using inference on the CPU, this is particularly important to align threading model with the rest of your application (and any third-party libraries that you use) to avoid oversubscription. For more information, see Note on the App-Level Threading section. +Internally, the Inference Engine has a threading abstraction level, which allows for compiling the [open source version](https://github.com/openvinotoolkit/openvino) with either Intel® Threading Building Blocks (Intel® TBB) which is now default, or OpenMP* as an alternative parallelism solution. 
When using inference on the CPU, this is particularly important to align threading model with the rest of your application (and any third-party libraries that you use) to avoid oversubscription. For more information, see Note on the App-Level Threading section. Since R1 2019, the OpenVINO™ toolkit comes pre-compiled with Intel TBB, so any OpenMP* API or environment settings (like `OMP_NUM_THREADS`) has no effect. @@ -171,7 +171,7 @@ Notice that on a multi-socket machine, the bare minimum of streams for a latency In addition, you can play with the batch size to find the throughput sweet spot. -If your application is hard or impossible to change in accordance with the multiple-requests logic, consider the "multiple-instance" trick to improve the throughput: +If your application is hard or impossible to change in accordance with the multiple-requests logic, consider the "multiple-instance" trick to improve the throughput: - For multi-socket execution, it is recommended to set [`KEY_CPU_THREADS_NUM`](../IE_DG/supported_plugins/CPU.md) to the number of cores per socket, and run as many instances of the application as you have sockets. - Similarly, for extremely lightweight networks (running faster than 1ms) and/or many-core machines (16+ cores), try limiting the number of CPU inference threads to just `#‍phys` cores and further, while trying to saturate the machine with running multiple instances of the application. @@ -186,15 +186,15 @@ Inference Engine relies on the [Compute Library for Deep Neural Networks (clDNN) - If your application is simultaneously using the inference on the CPU or otherwise loads the host heavily, make sure that the OpenCL driver threads do not starve. You can use [CPU configuration options](../IE_DG/supported_plugins/CPU.md) to limit number of inference threads for the CPU plugin. - In the GPU-only scenario, a GPU driver might occupy a CPU core with spin-looped polling for completion. If the _CPU_ utilization is a concern, consider the `KEY_CLDND_PLUGIN_THROTTLE` configuration option. -> **NOTE**: See the [Benchmark App Sample](../../inference-engine/samples/benchmark_app/README.md) code for a usage example. -Notice that while disabling the polling, this option might reduce the GPU performance, so usually this option is used with multiple [GPU streams](../IE_DG/supported_plugins/CL_DNN.md). +> **NOTE**: See the [Benchmark App Sample](../../inference-engine/samples/benchmark_app/README.md) code for a usage example. +Notice that while disabling the polling, this option might reduce the GPU performance, so usually this option is used with multiple [GPU streams](../IE_DG/supported_plugins/GPU.md). ### Intel® Movidius™ Myriad™ X Visual Processing Unit and Intel® Vision Accelerator Design with Intel® Movidius™ VPUs Since Intel® Movidius™ Myriad™ X Visual Processing Unit (Intel® Movidius™ Myriad™ 2 VPU) communicates with the host over USB, minimum four infer requests in flight are recommended to hide the data transfer costs. See Request-Based API and “GetBlob” Idiom and [Benchmark App Sample](../../inference-engine/samples/benchmark_app/README.md) for more information. -Intel® Vision Accelerator Design with Intel® Movidius™ VPUs requires to keep at least 32 inference requests in flight to fully saturate the device. +Intel® Vision Accelerator Design with Intel® Movidius™ VPUs requires to keep at least 32 inference requests in flight to fully saturate the device. 
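For example, with the Inference Engine Python API the required number of requests can be created up front and executed asynchronously. The snippet below is a rough sketch only: the model path, device name, and request count are placeholders, and a real application would rotate the requests continuously instead of issuing a single batch of them:

```python
import numpy as np
from openvino.inference_engine import IECore

ie = IECore()
net = ie.read_network(model="model.xml", weights="model.bin")  # placeholder paths
input_name = next(iter(net.input_info))

# Ask the plugin for several infer requests so they can stay in flight together.
exec_net = ie.load_network(network=net, device_name="MYRIAD", num_requests=4)

shape = net.input_info[input_name].input_data.shape
frames = [np.random.rand(*shape).astype(np.float32)
          for _ in range(len(exec_net.requests))]

for request, frame in zip(exec_net.requests, frames):
    request.async_infer({input_name: frame})  # start all requests without waiting

for request in exec_net.requests:
    request.wait()  # collect results as each request finishes
    # outputs are available via request.output_blobs
```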
### FPGA @@ -274,7 +274,7 @@ The following tips are provided to give general guidance on optimizing execution - Generally, GPU performance is better on heavy kernels (like Convolutions) and large inputs. So if the network inference time is already too small (~1ms of execution time), using the GPU would unlikely give a boost. -- A typical strategy to start with is to test the CPU-only and GPU-only scenarios first (with samples this is plain `-d CPU` or `-d GPU`). If there are specific kernels that are not supported by the GPU, the best option to try is the `HETERO:GPU,CPU` that automatically applies default splitting (based on the plugins layers support). Then, you can play with the manual affinity settings (for example, to further minimize the number of subgraphs). +- A typical strategy to start with is to test the CPU-only and GPU-only scenarios first (with samples this is plain `-d CPU` or `-d GPU`). If there are specific kernels that are not supported by the GPU, the best option to try is the `HETERO:GPU,CPU` that automatically applies default splitting (based on the plugins layers support). Then, you can play with the manual affinity settings (for example, to further minimize the number of subgraphs). - The general affinity “rule of thumb” is to keep computationally-intensive kernels on the accelerator, and "glue" (or helper) kernels on the CPU. Notice that this includes the granularity considerations. For example, running some (custom) activation on the CPU would result in too many conversions. @@ -337,7 +337,7 @@ For inference on the CPU there are multiple threads binding options, see If you are building an app-level pipeline with third-party components like GStreamer*, the general guidance for NUMA machines is as follows: - Whenever possible, use at least one instance of the pipeline per NUMA node: - - Pin the _entire_ pipeline instance to the specific NUMA node at the outer-most level (for example, use Kubernetes* and/or `numactl` command with proper settings before actual GStreamer commands). + - Pin the _entire_ pipeline instance to the specific NUMA node at the outer-most level (for example, use Kubernetes* and/or `numactl` command with proper settings before actual GStreamer commands). - Disable any individual pinning by the pipeline components (e.g. set [CPU_BIND_THREADS to 'NO'](../IE_DG/supported_plugins/CPU.md)). - Limit each instance with respect to number of inference threads. Use [CPU_THREADS_NUM](../IE_DG/supported_plugins/CPU.md) or or other means (e.g. virtualization, Kubernetes*, etc), to avoid oversubscription. - If pinning instancing/pinning of the entire pipeline is not possible or desirable, relax the inference threads pinning to just 'NUMA'. @@ -348,7 +348,7 @@ If you are building an app-level pipeline with third-party components like GStre - As explained in the CPU Checklist section, by default the Inference Engine uses Intel TBB as a parallel engine. Thus, any OpenVINO-internal threading (including CPU inference) uses the same threads pool, provided by the TBB. But there are also other threads in your application, so oversubscription is possible at the application level: - The rule of thumb is that you should try to have the overall number of active threads in your application equal to the number of cores in your machine. Keep in mind the spare core(s) that the OpenCL driver under the GPU plugin might also need. 
- One specific workaround to limit the number of threads for the Inference Engine is using the [CPU configuration options](../IE_DG/supported_plugins/CPU.md). -- To avoid further oversubscription, use the same threading model in all modules/libraries that your application uses. Notice that third party components might bring their own threading. For example, using Inference Engine which is now compiled with the TBB by default might lead to [performance troubles](https://www.threadingbuildingblocks.org/docs/help/reference/appendices/known_issues/interoperability.html) when mixed in the same app with another computationally-intensive library, but compiled with OpenMP. You can try to compile the [open source version](https://github.com/opencv/dldt) of the Inference Engine to use the OpenMP as well. But notice that in general, the TBB offers much better composability, than other threading solutions. +- To avoid further oversubscription, use the same threading model in all modules/libraries that your application uses. Notice that third party components might bring their own threading. For example, using Inference Engine which is now compiled with the TBB by default might lead to [performance troubles](https://www.threadingbuildingblocks.org/docs/help/reference/appendices/known_issues/interoperability.html) when mixed in the same app with another computationally-intensive library, but compiled with OpenMP. You can try to compile the [open source version](https://github.com/openvinotoolkit/openvino) of the Inference Engine to use the OpenMP as well. But notice that in general, the TBB offers much better composability, than other threading solutions. - If your code (or third party libraries) uses GNU OpenMP, the Intel® OpenMP (if you have recompiled Inference Engine with that) must be initialized first. This can be achieved by linking your application with the Intel OpenMP instead of GNU OpenMP, or using `LD_PRELOAD` on Linux* OS. ### Letting the Inference Engine Accelerate Image Pre-processing/Conversion @@ -416,7 +416,7 @@ If your application simultaneously executes multiple infer requests: - For FPGA and GPU, the actual work is serialized by a plugin and/or a driver anyway. -- Finally, for any VPU flavor, using multiple requests is a must for achieving good throughput. +- Finally, for any VPU flavor, using multiple requests is a must for achieving good throughput. In the Inference Engine, there is no notion of requests priorities. It is left to the user side (for example, not queuing the low priority infer request, until another higher priority is waiting). Notice that it would require additional logic to synchronize between executable networks (queues) in your application code. @@ -470,12 +470,12 @@ Example of Inference Engine calls: Notice that `Task_runNOThrow` is an Async API wrapper and it is executed in a different thread and triggers the Intel MKL-DNN execution: ![](../img/vtune_timeline.png) - + - In the Intel VTune Amplifier **Top-down view**, grouped by the **Task Domain**. Notice the `Task_runNoThrow` and `MKLDNN _INFER` that are bracketing the actual Intel MKL-DNN kernels execution: - + ![](../img/vtune_topdown_view.jpg) - + Similarly, you can use any GPU analysis in the Intel VTune Amplifier and get general correlation with Inference Engine API as well as the execution breakdown for OpenCL kernels. Just like with regular native application, further drill down in the counters is possible, however, this is mostly useful for optimizing custom kernels. 
Finally, with the Intel VTune Amplifier, the profiling is not limited to your user-level code (see the [corresponding section in the Intel® VTune™ Amplifier User's Guide](https://software.intel.com/en-us/vtune-amplifier-help-analyze-performance)). @@ -513,12 +513,12 @@ Since FPGA execution does not separate individual kernels, only bulk execution/d ``` subgraph1: 1. input preprocessing (mean data/FPGA):EXECUTED layerType: preprocessing realTime: 129 cpu: 129 -subgraph1: 2. input transfer to DDR:EXECUTED layerType: realTime: 201 cpu: 0 -subgraph1: 3. FPGA execute time:EXECUTED layerType: realTime: 3808 cpu: 0 subgraph1: 4. output transfer from DDR:EXECUTED layerType: realTime: 55 cpu: 0 -subgraph1: 5. FPGA output postprocessing:EXECUTED layerType: realTime: 7 cpu: 7 -subgraph1: 6. softmax/copy: EXECUTED layerType: realTime: 2 cpu: 2 -subgraph2: out_prob: NOT_RUN layerType: Output realTime: 0 cpu: 0 -subgraph2: prob: EXECUTED layerType: SoftMax realTime: 10 cpu: 10 +subgraph1: 2. input transfer to DDR:EXECUTED layerType: realTime: 201 cpu: 0 +subgraph1: 3. FPGA execute time:EXECUTED layerType: realTime: 3808 cpu: 0 subgraph1: 4. output transfer from DDR:EXECUTED layerType: realTime: 55 cpu: 0 +subgraph1: 5. FPGA output postprocessing:EXECUTED layerType: realTime: 7 cpu: 7 +subgraph1: 6. softmax/copy: EXECUTED layerType: realTime: 2 cpu: 2 +subgraph2: out_prob: NOT_RUN layerType: Output realTime: 0 cpu: 0 +subgraph2: prob: EXECUTED layerType: SoftMax realTime: 10 cpu: 10 Total time: 4212 microseconds ``` diff --git a/docs/snippets/CMakeLists.txt b/docs/snippets/CMakeLists.txt index 1d2a20eea0a3d7..48edae1e832547 100644 --- a/docs/snippets/CMakeLists.txt +++ b/docs/snippets/CMakeLists.txt @@ -26,17 +26,6 @@ if(NOT OpenCV_FOUND) "${CMAKE_CURRENT_SOURCE_DIR}/ShapeInference.cpp") endif() -# ONNX importer related files -if(NOT NGRAPH_ONNX_IMPORT_ENABLE) - list(REMOVE_ITEM SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/OnnxImporterTutorial0.cpp" - "${CMAKE_CURRENT_SOURCE_DIR}/OnnxImporterTutorial1.cpp" - "${CMAKE_CURRENT_SOURCE_DIR}/OnnxImporterTutorial2.cpp" - "${CMAKE_CURRENT_SOURCE_DIR}/OnnxImporterTutorial3.cpp") -endif() - -# remove snippets for deprecated / removed API -list(REMOVE_ITEM SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/Migration_CoreAPI.cpp") - # requires mfxFrameSurface1 and MSS API list(REMOVE_ITEM SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/dldt_optimization_guide2.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/dldt_optimization_guide3.cpp" diff --git a/docs/snippets/GPU_Kernel.cpp b/docs/snippets/GPU_Kernel.cpp index 5f849eb6a6a6a9..8b21a79dfe27dd 100644 --- a/docs/snippets/GPU_Kernel.cpp +++ b/docs/snippets/GPU_Kernel.cpp @@ -1,5 +1,4 @@ #include -#include "cldnn/cldnn_config.hpp" int main() { using namespace InferenceEngine; @@ -9,9 +8,5 @@ InferenceEngine::Core core; core.SetConfig({ { InferenceEngine::PluginConfigParams::KEY_CONFIG_FILE, "" } }, "GPU"); //! [part0] -//! [part1] -core.SetConfig({ { PluginConfigParams::KEY_DUMP_KERNELS, PluginConfigParams::YES } }, "GPU"); -//! [part1] - return 0; } diff --git a/docs/snippets/GPU_Kernels_Tuning.cpp b/docs/snippets/GPU_Kernels_Tuning.cpp deleted file mode 100644 index 25daeec5e2a263..00000000000000 --- a/docs/snippets/GPU_Kernels_Tuning.cpp +++ /dev/null @@ -1,14 +0,0 @@ -#include -#include "cldnn/cldnn_config.hpp" - -int main() { -using namespace InferenceEngine; -//! 
[part0] -Core ie; - ie.SetConfig({{ CONFIG_KEY(TUNING_MODE), CONFIG_VALUE(TUNING_CREATE) }}, "GPU"); - ie.SetConfig({{ CONFIG_KEY(TUNING_FILE), "/path/to/tuning/file.json" }}, "GPU"); - // Further LoadNetwork calls will use the specified tuning parameters -//! [part0] - -return 0; -} diff --git a/docs/snippets/GPU_RemoteBlob_API2.cpp b/docs/snippets/GPU_RemoteBlob_API2.cpp index 1bb00c17e03e94..13597ae45617ba 100644 --- a/docs/snippets/GPU_RemoteBlob_API2.cpp +++ b/docs/snippets/GPU_RemoteBlob_API2.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include int main() { @@ -28,7 +28,7 @@ auto shared_va_context = gpu::make_shared_context(ie, "GPU", disp); // compile network within a shared context ExecutableNetwork executable_network = ie.LoadNetwork(network, shared_va_context, - { { CLDNNConfigParams::KEY_CLDNN_NV12_TWO_INPUTS, + { { GPUConfigParams::KEY_GPU_NV12_TWO_INPUTS, PluginConfigParams::YES } }); diff --git a/docs/snippets/InferenceEngine_Caching0.cpp b/docs/snippets/InferenceEngine_Caching0.cpp new file mode 100644 index 00000000000000..5311a3d0bb681c --- /dev/null +++ b/docs/snippets/InferenceEngine_Caching0.cpp @@ -0,0 +1,17 @@ +#include + +int main() { +using namespace InferenceEngine; + std::string modelPath = "/tmp/myModel.xml"; + std::string device = "GNA"; + std::map deviceConfig; +//! [part0] + InferenceEngine::Core ie; // Step 1: create Inference engine object + ie.SetConfig({{CONFIG_KEY(CACHE_DIR), "myCacheFolder"}}); // Step 1b: Enable caching + auto cnnNet = ie.ReadNetwork(modelPath); // Step 2: ReadNetwork + //... // Step 3: Prepare inputs/outputs + //... // Step 4: Set device configuration + ie.LoadNetwork(cnnNet, device, deviceConfig); // Step 5: LoadNetwork +//! [part0] +return 0; +} diff --git a/docs/snippets/InferenceEngine_Caching1.cpp b/docs/snippets/InferenceEngine_Caching1.cpp new file mode 100644 index 00000000000000..3c9d0c5b22d558 --- /dev/null +++ b/docs/snippets/InferenceEngine_Caching1.cpp @@ -0,0 +1,13 @@ +#include + +int main() { +using namespace InferenceEngine; + std::string modelPath = "/tmp/myModel.xml"; + std::string device = "GNA"; + std::map deviceConfig; +//! [part1] + InferenceEngine::Core ie; // Step 1: create Inference engine object + ie.LoadNetwork(modelPath, device, deviceConfig); // Step 2: LoadNetwork by model file path +//! [part1] +return 0; +} diff --git a/docs/snippets/InferenceEngine_Caching2.cpp b/docs/snippets/InferenceEngine_Caching2.cpp new file mode 100644 index 00000000000000..aaf4b33c10da90 --- /dev/null +++ b/docs/snippets/InferenceEngine_Caching2.cpp @@ -0,0 +1,14 @@ +#include + +int main() { +using namespace InferenceEngine; + std::string modelPath = "/tmp/myModel.xml"; + std::string device = "GNA"; + std::map deviceConfig; +//! [part2] + InferenceEngine::Core ie; // Step 1: create Inference engine object + ie.SetConfig({{CONFIG_KEY(CACHE_DIR), "myCacheFolder"}}); // Step 1b: Enable caching + ie.LoadNetwork(modelPath, device, deviceConfig); // Step 2: LoadNetwork by model file path +//! [part2] +return 0; +} diff --git a/docs/snippets/InferenceEngine_Caching3.cpp b/docs/snippets/InferenceEngine_Caching3.cpp new file mode 100644 index 00000000000000..ce91a798552c79 --- /dev/null +++ b/docs/snippets/InferenceEngine_Caching3.cpp @@ -0,0 +1,20 @@ +#include + +int main() { +using namespace InferenceEngine; + std::string modelPath = "/tmp/myModel.xml"; + std::string deviceName = "GNA"; + std::map deviceConfig; + InferenceEngine::Core ie; +//! 
[part3] + // Get list of supported metrics + std::vector keys = ie.GetMetric(deviceName, METRIC_KEY(SUPPORTED_METRICS)); + + // Find 'IMPORT_EXPORT_SUPPORT' metric in supported metrics + auto it = std::find(keys.begin(), keys.end(), METRIC_KEY(IMPORT_EXPORT_SUPPORT)); + + // If metric 'IMPORT_EXPORT_SUPPORT' exists, check it's value + bool cachingSupported = (it != keys.end()) && ie.GetMetric(deviceName, METRIC_KEY(IMPORT_EXPORT_SUPPORT)); +//! [part3] + return 0; +} diff --git a/docs/snippets/InferenceEngine_network_with_state_infer.cpp b/docs/snippets/InferenceEngine_network_with_state_infer.cpp index 81a3070ba3b319..7af9c076931169 100644 --- a/docs/snippets/InferenceEngine_network_with_state_infer.cpp +++ b/docs/snippets/InferenceEngine_network_with_state_infer.cpp @@ -64,7 +64,13 @@ int main(int argc, char *argv[]) { inferRequest.Infer(); // check states auto states = inferRequest.QueryState(); + if (states.empty()) { + throw std::runtime_error("Queried states are empty"); + } auto mstate = as(states[0].GetState()); + if (mstate == nullptr) { + throw std::runtime_error("Can't cast state to MemoryBlob"); + } auto state_buf = mstate->rmap(); float * state =state_buf.as(); std::cout << state[0] << "\n"; diff --git a/docs/snippets/Migration_CoreAPI.cpp b/docs/snippets/Migration_CoreAPI.cpp deleted file mode 100644 index fd89803093b307..00000000000000 --- a/docs/snippets/Migration_CoreAPI.cpp +++ /dev/null @@ -1,48 +0,0 @@ -#include - -int main() { -std::string deviceName = "Device name"; -//! [part0] -InferenceEngine::InferencePlugin plugin = InferenceEngine::PluginDispatcher({ FLAGS_pp }).getPluginByDevice(FLAGS_d); -//! [part0] - -//! [part1] -InferenceEngine::Core core; -//! [part1] - -//! [part2] -InferenceEngine::CNNNetReader network_reader; -network_reader.ReadNetwork(fileNameToString(input_model)); -network_reader.ReadWeights(fileNameToString(input_model).substr(0, input_model.size() - 4) + ".bin"); -InferenceEngine::CNNNetwork network = network_reader.getNetwork(); -//! [part2] - -//! [part3] -InferenceEngine::CNNNetwork network = core.ReadNetwork(input_model); -//! [part3] - -//! [part4] -InferenceEngine::CNNNetwork network = core.ReadNetwork("model.onnx"); -//! [part4] - -//! [part5] -plugin.AddExtension(std::make_shared()); -//! [part5] - -//! [part6] -core.AddExtension(std::make_shared(), "CPU"); -//! [part6] - -//! [part7] -core.SetConfig({{PluginConfigParams::KEY_CONFIG_FILE, FLAGS_c}}, "GPU"); -//! [part7] - -//! [part8] -auto execNetwork = plugin.LoadNetwork(network, { }); -//! [part8] - -//! [part9] -auto execNetwork = core.LoadNetwork(network, deviceName, { }); -//! [part9] -return 0; -} diff --git a/docs/snippets/OnnxImporterTutorial0.cpp b/docs/snippets/OnnxImporterTutorial0.cpp deleted file mode 100644 index cf434622cb9395..00000000000000 --- a/docs/snippets/OnnxImporterTutorial0.cpp +++ /dev/null @@ -1,19 +0,0 @@ -#include -#include -#include "onnx_import/onnx.hpp" -#include -#include - -int main() { -//! [part0] -const std::int64_t version = 12; -const std::string domain = "ai.onnx"; -const std::set supported_ops = ngraph::onnx_import::get_supported_operators(version, domain); - -for(const auto& op : supported_ops) -{ - std::cout << op << std::endl; -} -//! 
[part0] -return 0; -} diff --git a/docs/snippets/OnnxImporterTutorial1.cpp b/docs/snippets/OnnxImporterTutorial1.cpp deleted file mode 100644 index 60122f1a1ea025..00000000000000 --- a/docs/snippets/OnnxImporterTutorial1.cpp +++ /dev/null @@ -1,15 +0,0 @@ -#include -#include -#include "onnx_import/onnx.hpp" - -int main() { -//! [part1] -const std::string op_name = "Abs"; -const std::int64_t version = 12; -const std::string domain = "ai.onnx"; -const bool is_abs_op_supported = ngraph::onnx_import::is_operator_supported(op_name, version, domain); - -std::cout << "Abs in version 12, domain `ai.onnx`is supported: " << (is_abs_op_supported ? "true" : "false") << std::endl; -//! [part1] -return 0; -} diff --git a/docs/snippets/OnnxImporterTutorial2.cpp b/docs/snippets/OnnxImporterTutorial2.cpp deleted file mode 100644 index 00ce2949a1d163..00000000000000 --- a/docs/snippets/OnnxImporterTutorial2.cpp +++ /dev/null @@ -1,29 +0,0 @@ -#include -#include -#include "onnx_import/onnx.hpp" -#include -#include - -int main() { -//! [part2] - const char * resnet50_path = "resnet50/model.onnx"; - std::ifstream resnet50_stream(resnet50_path); - if (resnet50_stream.is_open()) - { - try - { - const std::shared_ptr ng_function = ngraph::onnx_import::import_onnx_model(resnet50_stream); - - // Check shape of the first output, for example - std::cout << ng_function->get_output_shape(0) << std::endl; - // The output is Shape{1, 1000} - } - catch (const ngraph::ngraph_error& error) - { - std::cout << "Error when importing ONNX model: " << error.what() << std::endl; - } - } - resnet50_stream.close(); -//! [part2] -return 0; -} diff --git a/docs/snippets/OnnxImporterTutorial3.cpp b/docs/snippets/OnnxImporterTutorial3.cpp deleted file mode 100644 index 6fc1e1b59de907..00000000000000 --- a/docs/snippets/OnnxImporterTutorial3.cpp +++ /dev/null @@ -1,12 +0,0 @@ -#include -#include -#include "onnx_import/onnx.hpp" -#include - -int main() { -//! [part3] -const char * resnet50_path = "resnet50/model.onnx"; -const std::shared_ptr ng_function = ngraph::onnx_import::import_onnx_model(resnet50_path); -//! [part3] -return 0; -} diff --git a/docs/template_plugin/src/template_executable_network.cpp b/docs/template_plugin/src/template_executable_network.cpp index e46bd63e5a0faa..4aba4622e50428 100644 --- a/docs/template_plugin/src/template_executable_network.cpp +++ b/docs/template_plugin/src/template_executable_network.cpp @@ -175,9 +175,9 @@ InferenceEngine::Parameter TemplatePlugin::ExecutableNetwork::GetMetric(const st } // ! [executable_network:get_metric] -// ! [executable_network:export_impl] -void TemplatePlugin::ExecutableNetwork::ExportImpl(std::ostream& modelStream) { - OV_ITT_SCOPED_TASK(itt::domains::TemplatePlugin, "ExecutableNetwork::ExportImpl"); +// ! [executable_network:export] +void TemplatePlugin::ExecutableNetwork::Export(std::ostream& modelStream) { + OV_ITT_SCOPED_TASK(itt::domains::TemplatePlugin, "ExecutableNetwork::Export"); // Note: custom ngraph extensions are not supported std::map custom_opsets; @@ -198,4 +198,4 @@ void TemplatePlugin::ExecutableNetwork::ExportImpl(std::ostream& modelStream) { // TODO: implement network precision, layout, preprocessing info serialization } -// ! [executable_network:export_impl] +// ! 
[executable_network:export] diff --git a/docs/template_plugin/src/template_executable_network.hpp b/docs/template_plugin/src/template_executable_network.hpp index ca3bca11ba847f..a68df02f958934 100644 --- a/docs/template_plugin/src/template_executable_network.hpp +++ b/docs/template_plugin/src/template_executable_network.hpp @@ -30,7 +30,7 @@ class ExecutableNetwork : public InferenceEngine::ExecutableNetworkThreadSafeDef // Methods from a base class ExecutableNetworkThreadSafeDefault - void ExportImpl(std::ostream& model) override; + void Export(std::ostream& model) override; InferenceEngine::IInferRequestInternal::Ptr CreateInferRequestImpl(InferenceEngine::InputsDataMap networkInputs, InferenceEngine::OutputsDataMap networkOutputs) override; InferenceEngine::IInferRequestInternal::Ptr CreateInferRequest() override; diff --git a/docs/template_plugin/src/template_plugin.cpp b/docs/template_plugin/src/template_plugin.cpp index 87a509c8a77bee..a0f7a30ee171cf 100644 --- a/docs/template_plugin/src/template_plugin.cpp +++ b/docs/template_plugin/src/template_plugin.cpp @@ -95,14 +95,14 @@ InferenceEngine::IExecutableNetworkInternal::Ptr Plugin::LoadExeNetworkImpl(cons } // ! [plugin:load_exe_network_impl] -// ! [plugin:import_network_impl] -InferenceEngine::IExecutableNetworkInternal::Ptr Plugin::ImportNetworkImpl(std::istream& modelStream, const std::map& config) { - OV_ITT_SCOPED_TASK(itt::domains::TemplatePlugin, "Plugin::ImportNetworkImpl"); +// ! [plugin:import_network] +InferenceEngine::IExecutableNetworkInternal::Ptr Plugin::ImportNetwork(std::istream& modelStream, const std::map& config) { + OV_ITT_SCOPED_TASK(itt::domains::TemplatePlugin, "Plugin::ImportNetwork"); auto fullConfig = Configuration {config, _cfg}; return std::make_shared(modelStream, fullConfig, std::static_pointer_cast(shared_from_this())); } -// ! [plugin:import_network_impl] +// ! [plugin:import_network] // ! 
[plugin:query_network] InferenceEngine::QueryNetworkResult Plugin::QueryNetwork(const InferenceEngine::CNNNetwork& network, const ConfigMap& config) const { diff --git a/docs/template_plugin/src/template_plugin.hpp b/docs/template_plugin/src/template_plugin.hpp index ef2b506d497500..71c37410ea717e 100644 --- a/docs/template_plugin/src/template_plugin.hpp +++ b/docs/template_plugin/src/template_plugin.hpp @@ -28,7 +28,7 @@ class Plugin : public InferenceEngine::IInferencePlugin { void AddExtension(const std::shared_ptr& extension) override; InferenceEngine::Parameter GetConfig(const std::string& name, const std::map& options) const override; InferenceEngine::Parameter GetMetric(const std::string& name, const std::map& options) const override; - InferenceEngine::IExecutableNetworkInternal::Ptr ImportNetworkImpl(std::istream& model, const std::map& config) override; + InferenceEngine::IExecutableNetworkInternal::Ptr ImportNetwork(std::istream& model, const std::map& config) override; private: friend class ExecutableNetwork; diff --git a/docs/template_plugin/tests/functional/shared_tests_instances/behavior/caching_tests.cpp b/docs/template_plugin/tests/functional/shared_tests_instances/behavior/caching_tests.cpp index f61e4c54d7ec81..a698d72e52610d 100644 --- a/docs/template_plugin/tests/functional/shared_tests_instances/behavior/caching_tests.cpp +++ b/docs/template_plugin/tests/functional/shared_tests_instances/behavior/caching_tests.cpp @@ -15,7 +15,7 @@ namespace { 1, 2 }; - INSTANTIATE_TEST_CASE_P(smoke_CachingSupportCase_Template, LoadNetworkCacheTestBase, + INSTANTIATE_TEST_SUITE_P(smoke_CachingSupportCase_Template, LoadNetworkCacheTestBase, ::testing::Combine( ::testing::ValuesIn(LoadNetworkCacheTestBase::getStandardFunctions()), ::testing::ValuesIn(precisionsTemplate), diff --git a/docs/template_plugin/tests/functional/shared_tests_instances/behavior/config.cpp b/docs/template_plugin/tests/functional/shared_tests_instances/behavior/config.cpp index fad461c814da58..34e0eb46eace0c 100644 --- a/docs/template_plugin/tests/functional/shared_tests_instances/behavior/config.cpp +++ b/docs/template_plugin/tests/functional/shared_tests_instances/behavior/config.cpp @@ -24,14 +24,14 @@ const std::vector> inconfigs = { {{TEMPLATE_CONFIG_KEY(THROUGHPUT_STREAMS), CONFIG_VALUE(NO)}}, }; -INSTANTIATE_TEST_CASE_P(smoke_BehaviorTests, IncorrectConfigTests, +INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, IncorrectConfigTests, ::testing::Combine( ::testing::ValuesIn(netPrecisions), ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE), ::testing::ValuesIn(inconfigs)), IncorrectConfigTests::getTestCaseName); -INSTANTIATE_TEST_CASE_P(smoke_BehaviorTests, IncorrectConfigAPITests, +INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, IncorrectConfigAPITests, ::testing::Combine( ::testing::ValuesIn(netPrecisions), ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE), @@ -39,14 +39,14 @@ INSTANTIATE_TEST_CASE_P(smoke_BehaviorTests, IncorrectConfigAPITests, IncorrectConfigAPITests::getTestCaseName); -INSTANTIATE_TEST_CASE_P(smoke_BehaviorTests, CorrectConfigAPITests, +INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, CorrectConfigAPITests, ::testing::Combine( ::testing::ValuesIn(netPrecisions), ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE), ::testing::ValuesIn(configs)), CorrectConfigAPITests::getTestCaseName); -INSTANTIATE_TEST_CASE_P(smoke_Multi_BehaviorTests, CorrectConfigTests, +INSTANTIATE_TEST_SUITE_P(smoke_Multi_BehaviorTests, CorrectConfigTests, ::testing::Combine( ::testing::ValuesIn(netPrecisions), 
::testing::Values(CommonTestUtils::DEVICE_TEMPLATE), diff --git a/docs/template_plugin/tests/functional/shared_tests_instances/behavior/core_integration.cpp b/docs/template_plugin/tests/functional/shared_tests_instances/behavior/core_integration.cpp index 2b669a6520a376..2c067aaf7b68ab 100644 --- a/docs/template_plugin/tests/functional/shared_tests_instances/behavior/core_integration.cpp +++ b/docs/template_plugin/tests/functional/shared_tests_instances/behavior/core_integration.cpp @@ -16,11 +16,11 @@ namespace { // IE Class Common tests with // -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( smoke_IEClassBasicTestP, IEClassBasicTestP, ::testing::Values(std::make_pair("templatePlugin", CommonTestUtils::DEVICE_TEMPLATE))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( smoke_IEClassNetworkTestP, IEClassNetworkTestP, ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE)); @@ -28,39 +28,39 @@ INSTANTIATE_TEST_CASE_P( // IE Class GetMetric // -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( smoke_IEClassGetMetricTest, IEClassGetMetricTest_SUPPORTED_CONFIG_KEYS, ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE)); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( smoke_IEClassGetMetricTest, IEClassGetMetricTest_SUPPORTED_METRICS, ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE)); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( smoke_IEClassGetMetricTest, IEClassGetMetricTest_AVAILABLE_DEVICES, ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE)); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( smoke_IEClassGetMetricTest, IEClassGetMetricTest_FULL_DEVICE_NAME, ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE)); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( smoke_IEClassGetMetricTest, IEClassGetMetricTest_OPTIMIZATION_CAPABILITIES, ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE)); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( smoke_IEClassGetMetricTest, IEClassGetMetricTest_RANGE_FOR_ASYNC_INFER_REQUESTS, ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE)); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( smoke_IEClassGetMetricTest, IEClassGetMetricTest_ThrowUnsupported, ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE)); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( smoke_IEClassGetConfigTest, IEClassGetConfigTest_ThrowUnsupported, ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE)); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( smoke_IEClassGetAvailableDevices, IEClassGetAvailableDevices, ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE)); @@ -111,7 +111,7 @@ TEST_F(IEClassSetConfigTestHETERO, smoke_SetConfigNoThrow) { // IE Class GetConfig // -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( smoke_IEClassGetConfigTest, IEClassGetConfigTest, ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE)); @@ -143,46 +143,46 @@ TEST_F(IEClassGetConfigTestTEMPLATE, smoke_GetConfigNoThrow) { // Executable Network GetMetric // -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( smoke_IEClassExecutableNetworkGetMetricTest, IEClassExecutableNetworkGetMetricTest_SUPPORTED_CONFIG_KEYS, ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE, "MULTI:TEMPLATE", "HETERO:TEMPLATE")); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( smoke_IEClassExecutableNetworkGetMetricTest, IEClassExecutableNetworkGetMetricTest_SUPPORTED_METRICS, ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE, "MULTI:TEMPLATE", "HETERO:TEMPLATE")); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( smoke_IEClassExecutableNetworkGetMetricTest, 
IEClassExecutableNetworkGetMetricTest_NETWORK_NAME, ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE, "MULTI:TEMPLATE", "HETERO:TEMPLATE")); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( smoke_IEClassExecutableNetworkGetMetricTest, IEClassExecutableNetworkGetMetricTest_OPTIMAL_NUMBER_OF_INFER_REQUESTS, ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE, "MULTI:TEMPLATE", "HETERO:TEMPLATE")); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( smoke_IEClassExecutableNetworkGetMetricTest_ThrowsUnsupported, IEClassExecutableNetworkGetMetricTest, ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE, "MULTI:TEMPLATE", "HETERO:TEMPLATE")); // // Executable Network GetConfig / SetConfig // -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( smoke_IEClassExecutableNetworkGetConfigTest, IEClassExecutableNetworkGetConfigTest, ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE)); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( smoke_IEClassExecutableNetworkSetConfigTest, IEClassExecutableNetworkSetConfigTest, ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE)); // IE Class Query network -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( smoke_IEClassQueryNetworkTest, IEClassQueryNetworkTest, ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE)); // IE Class Load network -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( smoke_IEClassLoadNetworkTest, IEClassLoadNetworkTest, ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE)); @@ -192,19 +192,19 @@ INSTANTIATE_TEST_CASE_P( #ifdef ENABLE_MKL_DNN -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( smoke_IEClassHeteroExecutableNetworlGetMetricTest, IEClassHeteroExecutableNetworkGetMetricTest_SUPPORTED_CONFIG_KEYS, ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE)); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( smoke_IEClassHeteroExecutableNetworlGetMetricTest, IEClassHeteroExecutableNetworkGetMetricTest_SUPPORTED_METRICS, ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE)); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( smoke_IEClassHeteroExecutableNetworlGetMetricTest, IEClassHeteroExecutableNetworkGetMetricTest_NETWORK_NAME, ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE)); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( smoke_IEClassHeteroExecutableNetworlGetMetricTest, IEClassHeteroExecutableNetworkGetMetricTest_TARGET_FALLBACK, ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE)); diff --git a/docs/template_plugin/tests/functional/shared_tests_instances/behavior/cpp_holders.cpp b/docs/template_plugin/tests/functional/shared_tests_instances/behavior/cpp_holders.cpp index 8a2ee657b7917a..7c372c708ad3bb 100644 --- a/docs/template_plugin/tests/functional/shared_tests_instances/behavior/cpp_holders.cpp +++ b/docs/template_plugin/tests/functional/shared_tests_instances/behavior/cpp_holders.cpp @@ -20,19 +20,19 @@ const std::vector> orders = { {2, 1, 0} }; -INSTANTIATE_TEST_CASE_P(smoke_BehaviorTests, HoldersTest, +INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, HoldersTest, ::testing::Combine( ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE), ::testing::ValuesIn(orders)), HoldersTest::getTestCaseName); -INSTANTIATE_TEST_CASE_P(smoke_BehaviorTests, HoldersTestImportNetwork, +INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, HoldersTestImportNetwork, ::testing::Combine( ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE, "HETERO:TEMPLATE"), ::testing::ValuesIn(orders)), HoldersTest::getTestCaseName); -INSTANTIATE_TEST_CASE_P(smoke_BehaviorTests, HoldersTestOnImportedNetwork, 
+INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, HoldersTestOnImportedNetwork, ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE, "HETERO:TEMPLATE"), HoldersTestOnImportedNetwork::getTestCaseName); diff --git a/docs/template_plugin/tests/functional/shared_tests_instances/behavior/exec_graph_info.cpp b/docs/template_plugin/tests/functional/shared_tests_instances/behavior/exec_graph_info.cpp index a0aa412fb0d6e9..597e029fc5be53 100644 --- a/docs/template_plugin/tests/functional/shared_tests_instances/behavior/exec_graph_info.cpp +++ b/docs/template_plugin/tests/functional/shared_tests_instances/behavior/exec_graph_info.cpp @@ -19,7 +19,7 @@ const std::vector> configs = { {} }; -INSTANTIATE_TEST_CASE_P(smoke_BehaviorTests, ExecGraphTests, +INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, ExecGraphTests, ::testing::Combine( ::testing::ValuesIn(netPrecisions), ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE), diff --git a/docs/template_plugin/tests/functional/shared_tests_instances/behavior/infer_request.cpp b/docs/template_plugin/tests/functional/shared_tests_instances/behavior/infer_request.cpp index 2b39ef540057b0..edb803a614f689 100644 --- a/docs/template_plugin/tests/functional/shared_tests_instances/behavior/infer_request.cpp +++ b/docs/template_plugin/tests/functional/shared_tests_instances/behavior/infer_request.cpp @@ -19,7 +19,7 @@ const std::vector> configs = { {} }; -INSTANTIATE_TEST_CASE_P(smoke_BehaviorTests, InferRequestTests, +INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, InferRequestTests, ::testing::Combine( ::testing::ValuesIn(netPrecisions), ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE), diff --git a/docs/template_plugin/tests/functional/shared_tests_instances/behavior/infer_request_callback.cpp b/docs/template_plugin/tests/functional/shared_tests_instances/behavior/infer_request_callback.cpp index 8fd635ae8bf481..3bc9d1f6f02d3f 100644 --- a/docs/template_plugin/tests/functional/shared_tests_instances/behavior/infer_request_callback.cpp +++ b/docs/template_plugin/tests/functional/shared_tests_instances/behavior/infer_request_callback.cpp @@ -19,7 +19,7 @@ const std::vector> configs = { {} }; -INSTANTIATE_TEST_CASE_P(smoke_BehaviorTests, CallbackTests, +INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, CallbackTests, ::testing::Combine( ::testing::ValuesIn(netPrecisions), ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE), diff --git a/docs/template_plugin/tests/functional/shared_tests_instances/behavior/infer_request_config.cpp b/docs/template_plugin/tests/functional/shared_tests_instances/behavior/infer_request_config.cpp index 3ee70738e5a0d5..a6ffbe80bbcec9 100644 --- a/docs/template_plugin/tests/functional/shared_tests_instances/behavior/infer_request_config.cpp +++ b/docs/template_plugin/tests/functional/shared_tests_instances/behavior/infer_request_config.cpp @@ -19,7 +19,7 @@ const std::vector> configs = { {} }; -INSTANTIATE_TEST_CASE_P(smoke_BehaviorTests, InferConfigTests, +INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, InferConfigTests, ::testing::Combine( ::testing::ValuesIn(netPrecisions), ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE), diff --git a/docs/template_plugin/tests/functional/shared_tests_instances/behavior/infer_request_input.cpp b/docs/template_plugin/tests/functional/shared_tests_instances/behavior/infer_request_input.cpp index dde2fa6d2cd3f2..23d5cb7136faf4 100644 --- a/docs/template_plugin/tests/functional/shared_tests_instances/behavior/infer_request_input.cpp +++ 
b/docs/template_plugin/tests/functional/shared_tests_instances/behavior/infer_request_input.cpp @@ -17,7 +17,7 @@ const std::vector> configs = { {} }; -INSTANTIATE_TEST_CASE_P(smoke_BehaviorTests, InferRequestInputTests, +INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, InferRequestInputTests, ::testing::Combine( ::testing::ValuesIn(netPrecisions), ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE), diff --git a/docs/template_plugin/tests/functional/shared_tests_instances/behavior/infer_request_output.cpp b/docs/template_plugin/tests/functional/shared_tests_instances/behavior/infer_request_output.cpp index 9c8a24413fa647..eae52cec60c14e 100644 --- a/docs/template_plugin/tests/functional/shared_tests_instances/behavior/infer_request_output.cpp +++ b/docs/template_plugin/tests/functional/shared_tests_instances/behavior/infer_request_output.cpp @@ -17,7 +17,7 @@ const std::vector> configs = { {} }; -INSTANTIATE_TEST_CASE_P(smoke_BehaviorTests, InferRequestOutputTests, +INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, InferRequestOutputTests, ::testing::Combine( ::testing::ValuesIn(netPrecisions), ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE), diff --git a/docs/template_plugin/tests/functional/shared_tests_instances/behavior/layout.cpp b/docs/template_plugin/tests/functional/shared_tests_instances/behavior/layout.cpp index c5770ede2a0956..f4d8091b20e088 100644 --- a/docs/template_plugin/tests/functional/shared_tests_instances/behavior/layout.cpp +++ b/docs/template_plugin/tests/functional/shared_tests_instances/behavior/layout.cpp @@ -26,7 +26,7 @@ const std::vector> inputShapes = { { 3 } }; -INSTANTIATE_TEST_CASE_P(smoke_BehaviorTests, LayoutTest, +INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, LayoutTest, ::testing::Combine( ::testing::Values(InferenceEngine::Precision::FP32), ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE), diff --git a/docs/template_plugin/tests/functional/shared_tests_instances/behavior/preprocessing.cpp b/docs/template_plugin/tests/functional/shared_tests_instances/behavior/preprocessing.cpp index 29651f6da34f92..4aa9439b1ad891 100644 --- a/docs/template_plugin/tests/functional/shared_tests_instances/behavior/preprocessing.cpp +++ b/docs/template_plugin/tests/functional/shared_tests_instances/behavior/preprocessing.cpp @@ -17,7 +17,7 @@ const std::vector> configs = { {} }; -INSTANTIATE_TEST_CASE_P(smoke_PreprocessingPrecisionConvertTestsViaSetInput, PreprocessingPrecisionConvertTest, +INSTANTIATE_TEST_SUITE_P(smoke_PreprocessingPrecisionConvertTestsViaSetInput, PreprocessingPrecisionConvertTest, ::testing::Combine( ::testing::ValuesIn(inputPrecisions), ::testing::Values(4), // Number of input tensor channels @@ -26,7 +26,7 @@ INSTANTIATE_TEST_CASE_P(smoke_PreprocessingPrecisionConvertTestsViaSetInput, Pre ::testing::ValuesIn(configs)), PreprocessingPrecisionConvertTest::getTestCaseName); -INSTANTIATE_TEST_CASE_P(smoke_PreprocessingPrecisionConvertTestsViaGetBlob, PreprocessingPrecisionConvertTest, +INSTANTIATE_TEST_SUITE_P(smoke_PreprocessingPrecisionConvertTestsViaGetBlob, PreprocessingPrecisionConvertTest, ::testing::Combine( ::testing::ValuesIn(inputPrecisions), ::testing::Values(4), // Number of input tensor channels (blob_copy only supports 4d and 5d tensors) diff --git a/docs/template_plugin/tests/functional/shared_tests_instances/behavior/set_preprocess.cpp b/docs/template_plugin/tests/functional/shared_tests_instances/behavior/set_preprocess.cpp index a33bcd7638cc8a..19841bb64da643 100644 --- 
a/docs/template_plugin/tests/functional/shared_tests_instances/behavior/set_preprocess.cpp +++ b/docs/template_plugin/tests/functional/shared_tests_instances/behavior/set_preprocess.cpp @@ -26,21 +26,21 @@ const std::vector> heteroConfigs = { {{ "TARGET_FALLBACK", CommonTestUtils::DEVICE_TEMPLATE }} }; -INSTANTIATE_TEST_CASE_P(smoke_BehaviorTests, PreprocessTest, +INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, PreprocessTest, ::testing::Combine( ::testing::ValuesIn(netPrecisions), ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE), ::testing::ValuesIn(configs)), PreprocessTest::getTestCaseName); -INSTANTIATE_TEST_CASE_P(smoke_Multi_BehaviorTests, PreprocessTest, +INSTANTIATE_TEST_SUITE_P(smoke_Multi_BehaviorTests, PreprocessTest, ::testing::Combine( ::testing::ValuesIn(netPrecisions), ::testing::Values(CommonTestUtils::DEVICE_MULTI), ::testing::ValuesIn(multiConfigs)), PreprocessTest::getTestCaseName); -INSTANTIATE_TEST_CASE_P(smoke_Hetero_BehaviorTests, PreprocessTest, +INSTANTIATE_TEST_SUITE_P(smoke_Hetero_BehaviorTests, PreprocessTest, ::testing::Combine( ::testing::ValuesIn(netPrecisions), ::testing::Values(CommonTestUtils::DEVICE_HETERO), @@ -61,7 +61,7 @@ const std::vector ioLayouts = { InferenceEngine::Layout::NHWC }; -INSTANTIATE_TEST_CASE_P(smoke_BehaviorTests, PreprocessConversionTest, +INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, PreprocessConversionTest, ::testing::Combine( ::testing::ValuesIn(netPrecisions), ::testing::ValuesIn(ioPrecisions), @@ -75,7 +75,7 @@ INSTANTIATE_TEST_CASE_P(smoke_BehaviorTests, PreprocessConversionTest, ::testing::ValuesIn(configs)), PreprocessConversionTest::getTestCaseName); -INSTANTIATE_TEST_CASE_P(smoke_BehaviorTests, PreprocessDynamicallyInSetBlobTest, +INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, PreprocessDynamicallyInSetBlobTest, ::testing::Combine( ::testing::ValuesIn(netPrecisions), ::testing::Bool(), diff --git a/docs/template_plugin/tests/functional/shared_tests_instances/behavior/test_plugin.cpp b/docs/template_plugin/tests/functional/shared_tests_instances/behavior/test_plugin.cpp index 8d65895bc36b04..b8d5e9bc9bdb5a 100644 --- a/docs/template_plugin/tests/functional/shared_tests_instances/behavior/test_plugin.cpp +++ b/docs/template_plugin/tests/functional/shared_tests_instances/behavior/test_plugin.cpp @@ -17,21 +17,21 @@ const std::vector> configs = { {} }; -INSTANTIATE_TEST_CASE_P(smoke_BehaviorTests, BehaviorTests, +INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, BehaviorTests, ::testing::Combine( ::testing::Values(InferenceEngine::Precision::FP32), ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE), ::testing::ValuesIn(configs)), BehaviorTests::getTestCaseName); -INSTANTIATE_TEST_CASE_P(smoke_BehaviorTests, BehaviorTestInput, +INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, BehaviorTestInput, ::testing::Combine( ::testing::ValuesIn(netPrecisions), ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE), ::testing::ValuesIn(configs)), BehaviorTestInput::getTestCaseName); -INSTANTIATE_TEST_CASE_P(smoke_BehaviorTests, BehaviorTestOutput, +INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, BehaviorTestOutput, ::testing::Combine( ::testing::ValuesIn(netPrecisions), ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE), diff --git a/docs/template_plugin/tests/functional/shared_tests_instances/behavior/version.cpp b/docs/template_plugin/tests/functional/shared_tests_instances/behavior/version.cpp index 818a627a51973a..8b22d2eb24094e 100644 --- a/docs/template_plugin/tests/functional/shared_tests_instances/behavior/version.cpp +++ 
b/docs/template_plugin/tests/functional/shared_tests_instances/behavior/version.cpp @@ -12,7 +12,7 @@ const std::vector> configs = { {} }; -INSTANTIATE_TEST_CASE_P(smoke_BehaviorTests, VersionTest, +INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, VersionTest, ::testing::Combine( ::testing::Values(InferenceEngine::Precision::FP32), ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE), diff --git a/docs/template_plugin/tests/functional/shared_tests_instances/hetero/query_network.cpp b/docs/template_plugin/tests/functional/shared_tests_instances/hetero/query_network.cpp index b369e55adf225e..64b88ea714f88f 100644 --- a/docs/template_plugin/tests/functional/shared_tests_instances/hetero/query_network.cpp +++ b/docs/template_plugin/tests/functional/shared_tests_instances/hetero/query_network.cpp @@ -13,7 +13,7 @@ using namespace HeteroTests; auto ConvBias = ngraph::builder::subgraph::makeConvBias(); -INSTANTIATE_TEST_CASE_P(smoke_FullySupportedTopologies, QueryNetworkTest, +INSTANTIATE_TEST_SUITE_P(smoke_FullySupportedTopologies, QueryNetworkTest, ::testing::Combine( ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE, "HETERO:TEMPLATE", "MULTI:TEMPLATE"), ::testing::Values(ConvBias)), diff --git a/docs/template_plugin/tests/functional/shared_tests_instances/hetero/synthetic.cpp b/docs/template_plugin/tests/functional/shared_tests_instances/hetero/synthetic.cpp index 15f940780ddb64..164d959e408c08 100644 --- a/docs/template_plugin/tests/functional/shared_tests_instances/hetero/synthetic.cpp +++ b/docs/template_plugin/tests/functional/shared_tests_instances/hetero/synthetic.cpp @@ -11,13 +11,13 @@ namespace { using namespace HeteroTests; -INSTANTIATE_TEST_CASE_P(smoke_SingleMajorNode, HeteroSyntheticTest, +INSTANTIATE_TEST_SUITE_P(smoke_SingleMajorNode, HeteroSyntheticTest, ::testing::Combine( ::testing::Values(std::vector{{"TEMPLATE0", "templatePlugin"}, {"TEMPLATE1", "templatePlugin"}}), ::testing::ValuesIn(HeteroTests::HeteroSyntheticTest::_singleMajorNodeFunctions)), HeteroSyntheticTest::getTestCaseName); -INSTANTIATE_TEST_CASE_P(nightly_RandomMajorNodes, HeteroSyntheticTest, +INSTANTIATE_TEST_SUITE_P(nightly_RandomMajorNodes, HeteroSyntheticTest, ::testing::Combine( ::testing::Values(std::vector{{"TEMPLATE0", "templatePlugin"}, {"TEMPLATE1", "templatePlugin"}}), ::testing::ValuesIn(HeteroTests::HeteroSyntheticTest::_randomMajorNodeFunctions)), diff --git a/docs/template_plugin/tests/functional/shared_tests_instances/single_layer_tests/convolution.cpp b/docs/template_plugin/tests/functional/shared_tests_instances/single_layer_tests/convolution.cpp index 3f6d9c494066c7..c099cb3d5d4ffc 100644 --- a/docs/template_plugin/tests/functional/shared_tests_instances/single_layer_tests/convolution.cpp +++ b/docs/template_plugin/tests/functional/shared_tests_instances/single_layer_tests/convolution.cpp @@ -57,7 +57,7 @@ const auto conv2DParams_AutoPadValid = ::testing::Combine( ); // ! [test_convolution:instantiate] -INSTANTIATE_TEST_CASE_P(Convolution2D_ExplicitPadding, ConvolutionLayerTest, +INSTANTIATE_TEST_SUITE_P(Convolution2D_ExplicitPadding, ConvolutionLayerTest, ::testing::Combine( conv2DParams_ExplicitPadding, ::testing::ValuesIn(netPrecisions), @@ -70,7 +70,7 @@ INSTANTIATE_TEST_CASE_P(Convolution2D_ExplicitPadding, ConvolutionLayerTest, ConvolutionLayerTest::getTestCaseName); // ! 
[test_convolution:instantiate] -INSTANTIATE_TEST_CASE_P(Convolution2D_AutoPadValid, ConvolutionLayerTest, +INSTANTIATE_TEST_SUITE_P(Convolution2D_AutoPadValid, ConvolutionLayerTest, ::testing::Combine( conv2DParams_AutoPadValid, ::testing::ValuesIn(netPrecisions), @@ -113,7 +113,7 @@ const auto conv3DParams_AutoPadValid = ::testing::Combine( ::testing::Values(ngraph::op::PadType::VALID) ); -INSTANTIATE_TEST_CASE_P(smoke_Convolution3D_ExplicitPadding, ConvolutionLayerTest, +INSTANTIATE_TEST_SUITE_P(smoke_Convolution3D_ExplicitPadding, ConvolutionLayerTest, ::testing::Combine( conv3DParams_ExplicitPadding, ::testing::ValuesIn(netPrecisions), @@ -125,7 +125,7 @@ INSTANTIATE_TEST_CASE_P(smoke_Convolution3D_ExplicitPadding, ConvolutionLayerTes ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE)), ConvolutionLayerTest::getTestCaseName); -INSTANTIATE_TEST_CASE_P(nightly_Convolution3D_AutoPadValid, ConvolutionLayerTest, +INSTANTIATE_TEST_SUITE_P(nightly_Convolution3D_AutoPadValid, ConvolutionLayerTest, ::testing::Combine( conv3DParams_AutoPadValid, ::testing::ValuesIn(netPrecisions), diff --git a/docs/template_plugin/tests/functional/shared_tests_instances/single_layer_tests/reshape.cpp b/docs/template_plugin/tests/functional/shared_tests_instances/single_layer_tests/reshape.cpp index 85313f410d3606..d95ecdd81c826f 100644 --- a/docs/template_plugin/tests/functional/shared_tests_instances/single_layer_tests/reshape.cpp +++ b/docs/template_plugin/tests/functional/shared_tests_instances/single_layer_tests/reshape.cpp @@ -2,43 +2,58 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "single_layer_tests/reshape.hpp" + #include -#include "single_layer_tests/reshape.hpp" #include "common_test_utils/test_constants.hpp" using namespace LayerTestsDefinitions; namespace { const std::vector netPrecisions = { - InferenceEngine::Precision::FP32, + InferenceEngine::Precision::FP32, }; -INSTANTIATE_TEST_CASE_P(smoke_ReshapeCheckDynBatch, ReshapeLayerTest, - ::testing::Combine( - ::testing::Values(true), - ::testing::ValuesIn(netPrecisions), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(std::vector({30, 30, 30, 30})), - ::testing::Values(std::vector({30, 30, 30, 30})), - ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE), - ::testing::Values(std::map({}))), - ReshapeLayerTest::getTestCaseName); - -INSTANTIATE_TEST_CASE_P(smoke_ReshapeCheck, ReshapeLayerTest, - ::testing::Combine( - ::testing::Values(true), - ::testing::ValuesIn(netPrecisions), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(InferenceEngine::Layout::ANY), - ::testing::Values(std::vector({10, 10, 10, 10})), - ::testing::Values(std::vector({10, 0, 100})), - ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE), - ::testing::Values(std::map({}))), - ReshapeLayerTest::getTestCaseName); -} // namespace \ No newline at end of file +INSTANTIATE_TEST_SUITE_P( + smoke_ReshapeCheckDynBatch, ReshapeLayerTestRevise, + ::testing::Combine( + ::testing::Values(true), ::testing::ValuesIn(netPrecisions), + ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), + ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), + ::testing::Values(InferenceEngine::Layout::ANY), + 
::testing::Values(InferenceEngine::Layout::ANY), + ::testing::Values(std::vector({30, 30, 30, 30})), + ::testing::Values(std::vector({30, 30, 30, 30})), + ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE), + ::testing::Values(std::map({}))), + ReshapeLayerTestRevise::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P( + smoke_ReshapeCheck, ReshapeLayerTestRevise, + ::testing::Combine( + ::testing::Values(true), ::testing::ValuesIn(netPrecisions), + ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), + ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), + ::testing::Values(InferenceEngine::Layout::ANY), + ::testing::Values(InferenceEngine::Layout::ANY), + ::testing::Values(std::vector({10, 10, 10, 10})), + ::testing::Values(std::vector({10, 0, 100})), + ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE), + ::testing::Values(std::map({}))), + ReshapeLayerTestRevise::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P( + smoke_ReshapeCheckNegative, ReshapeLayerTestRevise, + ::testing::Combine( + ::testing::Values(true), ::testing::ValuesIn(netPrecisions), + ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), + ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), + ::testing::Values(InferenceEngine::Layout::ANY), + ::testing::Values(InferenceEngine::Layout::ANY), + ::testing::Values(std::vector({10, 10, 10, 10})), + ::testing::Values(std::vector({10, -1, 100})), + ::testing::Values(CommonTestUtils::DEVICE_TEMPLATE), + ::testing::Values(std::map({}))), + ReshapeLayerTestRevise::getTestCaseName); +} // namespace diff --git a/docs/template_plugin/tests/functional/shared_tests_instances/single_layer_tests/softmax.cpp b/docs/template_plugin/tests/functional/shared_tests_instances/single_layer_tests/softmax.cpp index 3ad1b07c736e5e..583d2d7b81e727 100644 --- a/docs/template_plugin/tests/functional/shared_tests_instances/single_layer_tests/softmax.cpp +++ b/docs/template_plugin/tests/functional/shared_tests_instances/single_layer_tests/softmax.cpp @@ -41,7 +41,7 @@ const auto params2D = testing::Combine( testing::Values(std::map()) ); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( smoke_SoftMax2D, SoftMaxLayerTest, params2D, @@ -68,7 +68,7 @@ const auto params4D = testing::Combine( testing::Values(std::map()) ); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( smoke_SoftMax4D, SoftMaxLayerTest, params4D, diff --git a/docs/template_plugin/tests/functional/shared_tests_instances/single_layer_tests/split.cpp b/docs/template_plugin/tests/functional/shared_tests_instances/single_layer_tests/split.cpp index 44f2c6737785b4..f4ea3db8af9525 100644 --- a/docs/template_plugin/tests/functional/shared_tests_instances/single_layer_tests/split.cpp +++ b/docs/template_plugin/tests/functional/shared_tests_instances/single_layer_tests/split.cpp @@ -11,7 +11,7 @@ using namespace LayerTestsDefinitions; namespace { -INSTANTIATE_TEST_CASE_P(smoke_NumSplitsCheck, SplitLayerTest, +INSTANTIATE_TEST_SUITE_P(smoke_NumSplitsCheck, SplitLayerTest, ::testing::Combine( ::testing::Values(1, 2, 3, 5, 6, 10, 30), ::testing::Values(0, 1, 2, 3), diff --git a/docs/template_plugin/tests/functional/skip_tests_config.cpp b/docs/template_plugin/tests/functional/skip_tests_config.cpp index 252ed7c9a712c9..8d100118a9d248 100644 --- a/docs/template_plugin/tests/functional/skip_tests_config.cpp +++ b/docs/template_plugin/tests/functional/skip_tests_config.cpp @@ -10,10 +10,10 @@ std::vector disabledTestPatterns() { return { ".*ExclusiveAsyncRequests.*", - ".*reusableCPUStreamsExecutor.*", + 
".*ReusableCPUStreamsExecutor.*", R"(.*SplitLayerTest.*numSplits\=30.*)", // CVS-51758 ".*PreprocessConversionTest.*oLT=NHWC.*", ".*PreprocessDynamicallyInSetBlobTest.*oPRC=0.*oLT=1.*", }; -} \ No newline at end of file +} diff --git a/inference-engine/CMakeLists.txt b/inference-engine/CMakeLists.txt index 1ac7fd8bf62b4a..a11aac8d8a24de 100644 --- a/inference-engine/CMakeLists.txt +++ b/inference-engine/CMakeLists.txt @@ -13,55 +13,28 @@ include(cmake/features.cmake) # resolving dependencies for the project include(cmake/dependencies.cmake) -function(ie_developer_export_targets) - openvino_developer_export_targets(COMPONENT inference_engine TARGETS ${ARGN}) -endfunction() - -function(ie_developer_export) - set(all_dev_targets gflags ie_libraries) - foreach(component IN LISTS openvino_export_components) - export(TARGETS ${${component}} NAMESPACE IE:: - APPEND FILE "${CMAKE_BINARY_DIR}/${component}_dev_targets.cmake") - list(APPEND all_dev_targets ${${component}}) - endforeach() - - add_custom_target(ie_dev_targets ALL DEPENDS ${all_dev_targets}) -endfunction() - add_subdirectory(thirdparty) - add_subdirectory(src) - add_subdirectory(ie_bridges/c) -if(ENABLE_TESTS) - add_subdirectory(tests_deprecated) - add_subdirectory(tests) +if(ENABLE_PYTHON) + add_subdirectory(ie_bridges/python) endif() add_subdirectory(tools) +add_subdirectory(samples) -function(ie_build_samples) - # samples should be build with the same flags as from OpenVINO package, - # so unset all flags - foreach(var CMAKE_CXX_FLAGS CMAKE_C_FLAGS CMAKE_CXX_STANDARD - CMAKE_EXE_LINKER_FLAGS CMAKE_POLICY_DEFAULT_CMP0063 - CMAKE_CXX_VISIBILITY_PRESET CMAKE_C_VISIBILITY_PRESET - CMAKE_VISIBILITY_INLINES_HIDDEN CMAKE_POSITION_INDEPENDENT_CODE - THREADS_PREFER_PTHREAD_FLAG X86_64 X86 ARM AARCH64 LINUX - MINGW64 CMAKE_BUILD_TYPE CMAKE_MACOSX_RPATH) - unset(${var}) - endforeach() - include("${IEDevScripts_DIR}/compile_flags/sanitizer.cmake") - add_subdirectory(samples) -endfunction() +if(ENABLE_TESTS) + add_subdirectory(tests_deprecated) + add_subdirectory(tests) +endif() -# gflags and format_reader targets are kept inside of samples directory and -# they must be built even if samples build is disabled (required for tests and tools). 
-ie_build_samples() +# +# Coverage +# -if(ENABLE_PYTHON) - add_subdirectory(ie_bridges/python) +if(ENABLE_COVERAGE) + include(cmake/coverage.cmake) endif() # @@ -70,19 +43,7 @@ endif() # install C++ samples -ie_cpack_add_component(cpp_samples DEPENDS core) - -install(DIRECTORY ../thirdparty/zlib - DESTINATION ${IE_CPACK_IE_DIR}/samples/cpp/thirdparty - COMPONENT cpp_samples - USE_SOURCE_PERMISSIONS - PATTERN .clang-format EXCLUDE) - -install(DIRECTORY ../thirdparty/cnpy - DESTINATION ${IE_CPACK_IE_DIR}/samples/cpp/thirdparty - COMPONENT cpp_samples - USE_SOURCE_PERMISSIONS - PATTERN .clang-format EXCLUDE) +ie_cpack_add_component(cpp_samples DEPENDS cpp_samples_deps core) if(UNIX) install(DIRECTORY samples/ @@ -142,7 +103,7 @@ endif() # Developer package # -openvino_developer_export_targets(COMPONENT openvino_common TARGETS format_reader gflags ie_samples_utils) +openvino_developer_export_targets(COMPONENT openvino_common TARGETS format_reader ie_samples_utils) # for Template plugin if(NGRAPH_INTERPRETER_ENABLE) @@ -153,36 +114,34 @@ function(ie_generate_dev_package_config) # dummy check that OpenCV is here find_package(OpenCV QUIET) - ie_developer_export() + set(all_dev_targets gflags ie_libraries) + foreach(component IN LISTS openvino_export_components) + export(TARGETS ${${component}} NAMESPACE IE:: + APPEND FILE "${CMAKE_BINARY_DIR}/${component}_dev_targets.cmake") + list(APPEND all_dev_targets ${${component}}) + endforeach() + add_custom_target(ie_dev_targets ALL DEPENDS ${all_dev_targets}) configure_package_config_file("${InferenceEngine_SOURCE_DIR}/cmake/templates/InferenceEngineDeveloperPackageConfig.cmake.in" - "${CMAKE_BINARY_DIR}/InferenceEngineDeveloperPackageConfig.cmake" - INSTALL_DESTINATION share # not used - PATH_VARS "OpenVINO_MAIN_SOURCE_DIR;IE_MAIN_SOURCE_DIR;gflags_BINARY_DIR" - NO_CHECK_REQUIRED_COMPONENTS_MACRO) + "${CMAKE_BINARY_DIR}/InferenceEngineDeveloperPackageConfig.cmake" + INSTALL_DESTINATION share # not used + PATH_VARS "OpenVINO_MAIN_SOURCE_DIR;IE_MAIN_SOURCE_DIR" + NO_CHECK_REQUIRED_COMPONENTS_MACRO) configure_file("${IE_MAIN_SOURCE_DIR}/cmake/templates/InferenceEngineConfig-version.cmake.in" - "${CMAKE_BINARY_DIR}/InferenceEngineDeveloperPackageConfig-version.cmake" - @ONLY) + "${CMAKE_BINARY_DIR}/InferenceEngineDeveloperPackageConfig-version.cmake" + @ONLY) endfunction() ie_generate_dev_package_config() -# -# Coverage -# - -if(ENABLE_COVERAGE) - include(cmake/coverage.cmake) -endif() - # # Add extra modules # function(register_extra_modules) # post export - ie_developer_export_targets(inference_engine) + openvino_developer_export_targets(COMPONENT inference_engine TARGETS inference_engine) openvino_developer_export_targets(COMPONENT ngraph TARGETS ${NGRAPH_LIBRARIES}) set(InferenceEngineDeveloperPackage_DIR "${CMAKE_CURRENT_BINARY_DIR}/build-modules") diff --git a/inference-engine/cmake/dependencies.cmake b/inference-engine/cmake/dependencies.cmake index 4ce1ef3136550e..b270c46f2da7cc 100644 --- a/inference-engine/cmake/dependencies.cmake +++ b/inference-engine/cmake/dependencies.cmake @@ -295,25 +295,25 @@ if (ENABLE_SPEECH_DEMO) if(DEFINED IE_PATH_TO_DEPS) if (WIN32 AND X86_64) RESOLVE_DEPENDENCY(SPEECH_LIBS_AND_DEMOS - ARCHIVE_WIN "speech_demo_1.0.0.755_windows.zip" + ARCHIVE_WIN "speech_demo_1.0.0.774_windows.zip" VERSION_REGEX ".*_([0-9]+.[0-9]+.[0-9]+.[0-9]+).*" - TARGET_PATH "${TEMP}/speech_demo_1.0.0.755" - SHA256 "58adef14b8a749f70fa83888614cee34b941956e6e958e445e3f48885b3c20a0") + TARGET_PATH "${TEMP}/speech_demo_1.0.0.774" + SHA256 
"67b25170be5e89a4f0e90e8b39623b60c9a15b965c30329385e295fcd2edc856") debug_message(STATUS "speech_libs_and_demos=" ${SPEECH_LIBS_AND_DEMOS}) elseif (LINUX AND X86_64) if (LINUX_OS_NAME STREQUAL "CentOS 7" OR CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.9") RESOLVE_DEPENDENCY(SPEECH_LIBS_AND_DEMOS - ARCHIVE_LIN "speech_demo_1.0.0.755_centos.tgz" + ARCHIVE_LIN "speech_demo_1.0.0.774_centos.tgz" VERSION_REGEX ".*_([0-9]+.[0-9]+.[0-9]+.[0-9]+).*" - TARGET_PATH "${TEMP}/speech_demo_1.0.0.755" - SHA256 "716201e377714ac50f3909c445d36d47a089de50a557d8ef65232de040671188") + TARGET_PATH "${TEMP}/speech_demo_1.0.0.774" + SHA256 "5ec3b7be9ae05376aefae5bd5fd4a39b12c274e82817fd3218120b8e8fc8ff5a") debug_message(STATUS "speech_libs_and_demos=" ${SPEECH_LIBS_AND_DEMOS}) else() RESOLVE_DEPENDENCY(SPEECH_LIBS_AND_DEMOS - ARCHIVE_LIN "speech_demo_1.0.0.755_linux.tgz" + ARCHIVE_LIN "speech_demo_1.0.0.774_linux.tgz" VERSION_REGEX ".*_([0-9]+.[0-9]+.[0-9]+.[0-9]+).*" - TARGET_PATH "${TEMP}/speech_demo_1.0.0.755" - SHA256 "7714b8776ec0183ed73eed6d3d965ee6d5c15d2dc49ee5ae118cc368c89c7a9d") + TARGET_PATH "${TEMP}/speech_demo_1.0.0.774" + SHA256 "f0bbd0a6218b0365e7cfb1f860b34e4ace7e0d47dd60b369cdea8a480329810f") debug_message(STATUS "speech_libs_and_demos=" ${SPEECH_LIBS_AND_DEMOS}) endif() else() diff --git a/inference-engine/cmake/templates/InferenceEngineConfig.cmake.in b/inference-engine/cmake/templates/InferenceEngineConfig.cmake.in index cb4c62442c5ed6..261edbf3d730f3 100644 --- a/inference-engine/cmake/templates/InferenceEngineConfig.cmake.in +++ b/inference-engine/cmake/templates/InferenceEngineConfig.cmake.in @@ -29,6 +29,11 @@ # Common functions # +if(NOT DEFINED CMAKE_FIND_PACKAGE_NAME) + set(CMAKE_FIND_PACKAGE_NAME InferenceEngine) + set(_need_package_name_reset ON) +endif() + # we have to use our own version of find_dependency because of support cmake 3.7 macro(_ie_find_dependency dep) set(cmake_fd_quiet_arg) @@ -138,3 +143,8 @@ unset(IE_PACKAGE_PREFIX_DIR) set_and_check(InferenceEngine_INCLUDE_DIRS "@PACKAGE_IE_INCLUDE_DIR@") check_required_components(${CMAKE_FIND_PACKAGE_NAME}) + +if(_need_package_name_reset) + unset(CMAKE_FIND_PACKAGE_NAME) + unset(_need_package_name_reset) +endif() diff --git a/inference-engine/ie_bridges/python/CMakeLists.txt b/inference-engine/ie_bridges/python/CMakeLists.txt index 5c80af7f574f72..b8216b0cb3435a 100644 --- a/inference-engine/ie_bridges/python/CMakeLists.txt +++ b/inference-engine/ie_bridges/python/CMakeLists.txt @@ -68,6 +68,10 @@ if(ENABLE_WHEEL) add_subdirectory(wheel) endif() +if (NGRAPH_PYTHON_BUILD_ENABLE) + add_dependencies(ie_api _pyngraph) +endif() + # install ie_cpack_add_component(${PYTHON_VERSION}) diff --git a/inference-engine/ie_bridges/python/sample/ngraph_function_creation_sample/README.md b/inference-engine/ie_bridges/python/sample/ngraph_function_creation_sample/README.md index c33d67103c6954..f0701f963aea20 100644 --- a/inference-engine/ie_bridges/python/sample/ngraph_function_creation_sample/README.md +++ b/inference-engine/ie_bridges/python/sample/ngraph_function_creation_sample/README.md @@ -1,7 +1,8 @@ # nGraph Function Creation Python* Sample {#openvino_inference_engine_ie_bridges_python_sample_ngraph_function_creation_sample_README} -This sample demonstrates how to execute an inference using [nGraph function feature](../../../../../docs/nGraph_DG/build_function.md) to create a network that uses weights from LeNet classification network. So you don't need an XML file, the model will be created from the source code on the fly. 
-In addition to regular images, the sample also supports single-channel ubyte images as an input. +This sample demonstrates how to execute an inference using the [nGraph function feature](../../../../../docs/nGraph_DG/build_function.md) to create a network that uses weights from the LeNet classification network, which is known to work well on digit classification tasks. So you don't need an XML file; the model is created from the source code on the fly. + +In addition to regular grayscale images with a digit, the sample also supports single-channel `ubyte` images as an input. The following Inference Engine Python API is used in the application: @@ -14,6 +15,9 @@ Basic Inference Engine API is covered by [Hello Classification Python* Sample](. | Options | Values | | :------------------------- | :---------------------------------------------------------------------- | +| Validated Models | LeNet (image classification network) | +| Model Format | Network weights file (\*.bin) | +| Validated images | The sample uses OpenCV\* to [read input grayscale image](https://docs.opencv.org/master/d4/da8/group__imgcodecs.html#ga288b8b3da0892bd651fce07b3bbd3a56) (\*.bmp, \*.png) or single-channel `ubyte` image | | Supported devices | [All](../../../../../docs/IE_DG/supported_plugins/Supported_Devices.md) | | Other language realization | [C++](../../../../samples/ngraph_function_creation_sample) | @@ -72,7 +76,7 @@ To run the sample, you need specify a model weights and image: You can do inference of an image using a pre-trained model on a GPU using the following command: ```sh -python ngraph_function_creation_sample.py -m /lenet.bin -i /3.bmp -d GPU +python ngraph_function_creation_sample.py -m /lenet.bin -i /3.png -d GPU ``` ## Sample Output @@ -84,10 +88,10 @@ The sample application logs each step in a standard output stream and outputs to [ INFO ] Loading the network using ngraph function with weights from /lenet.bin [ INFO ] Configuring input and output blobs [ INFO ] Loading the model to the plugin -[ WARNING ] /3.bmp is inverted to white over black -[ WARNING ] /3.bmp is resized from (100, 100) to (28, 28) +[ WARNING ] /3.png is inverted to white over black +[ WARNING ] /3.png is resized from (351, 353) to (28, 28) [ INFO ] Starting inference in synchronous mode -[ INFO ] Image path: /3.bmp +[ INFO ] Image path: /3.png [ INFO ] Top 10 results: [ INFO ] classid probability [ INFO ] ------------------- diff --git a/inference-engine/ie_bridges/python/src/openvino/inference_engine/CMakeLists.txt b/inference-engine/ie_bridges/python/src/openvino/inference_engine/CMakeLists.txt index 1f623fb48336c0..17b8bf5b9b513f 100644 --- a/inference-engine/ie_bridges/python/src/openvino/inference_engine/CMakeLists.txt +++ b/inference-engine/ie_bridges/python/src/openvino/inference_engine/CMakeLists.txt @@ -77,4 +77,5 @@ install(PROGRAMS __init__.py DESTINATION ${PYTHON_BRIDGE_CPACK_PATH}/${PYTHON_VERSION}/openvino/inference_engine COMPONENT ${PYTHON_VERSION}) -add_clang_format_target(${TARGET_NAME}_clang FOR_TARGETS ${TARGET_NAME}) \ No newline at end of file +add_clang_format_target(${TARGET_NAME}_clang FOR_TARGETS ${TARGET_NAME} + EXCLUDE_PATTERNS ".*\\.cxx;.*\\.pxd;.*\\.pyx") diff --git a/inference-engine/ie_bridges/python/src/openvino/inference_engine/ie_api.pxd b/inference-engine/ie_bridges/python/src/openvino/inference_engine/ie_api.pxd index 5d942f93050246..efb389259d36f8 100644 --- a/inference-engine/ie_bridges/python/src/openvino/inference_engine/ie_api.pxd +++
b/inference-engine/ie_bridges/python/src/openvino/inference_engine/ie_api.pxd @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 from .cimport ie_api_impl_defs as C -from .ie_api_impl_defs cimport CBlob, CTensorDesc, InputInfo, CPreProcessChannel, CPreProcessInfo, CExecutableNetwork +from .ie_api_impl_defs cimport CBlob, CTensorDesc, InputInfo, CPreProcessChannel, CPreProcessInfo, CExecutableNetwork, CVariableState import os diff --git a/inference-engine/ie_bridges/python/src/openvino/offline_transformations/CMakeLists.txt b/inference-engine/ie_bridges/python/src/openvino/offline_transformations/CMakeLists.txt index c6315336ba2929..27c9e7bf898257 100644 --- a/inference-engine/ie_bridges/python/src/openvino/offline_transformations/CMakeLists.txt +++ b/inference-engine/ie_bridges/python/src/openvino/offline_transformations/CMakeLists.txt @@ -42,7 +42,8 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") target_compile_options(${TARGET_NAME} PRIVATE "-Wno-error=register") endif() -add_clang_format_target(${TARGET_NAME}_clang FOR_TARGETS ${TARGET_NAME}) +add_clang_format_target(${TARGET_NAME}_clang FOR_TARGETS ${TARGET_NAME} + EXCLUDE_PATTERNS ".*\\.cxx;.*\\.pxd;.*\\.pyx") # perform copy add_custom_command(TARGET ${TARGET_NAME} diff --git a/inference-engine/ie_bridges/python/src/openvino/offline_transformations/offline_transformations_api.pyx b/inference-engine/ie_bridges/python/src/openvino/offline_transformations/offline_transformations_api.pyx index bd101280fcbb16..266c1dc94d9475 100644 --- a/inference-engine/ie_bridges/python/src/openvino/offline_transformations/offline_transformations_api.pyx +++ b/inference-engine/ie_bridges/python/src/openvino/offline_transformations/offline_transformations_api.pyx @@ -17,8 +17,8 @@ def ApplyPOTTransformations(IENetwork network, string device): C.ApplyPOTTransformations(network.impl, device) -def ApplyLowLatencyTransformation(IENetwork network, int64_t num_iterations=1): - C.ApplyLowLatencyTransformation(network.impl, num_iterations) +def ApplyLowLatencyTransformation(IENetwork network, bool use_const_initializer = True): + C.ApplyLowLatencyTransformation(network.impl, use_const_initializer) def ApplyPruningTransformation(IENetwork network): diff --git a/inference-engine/ie_bridges/python/src/openvino/offline_transformations/offline_transformations_api_impl.cpp b/inference-engine/ie_bridges/python/src/openvino/offline_transformations/offline_transformations_api_impl.cpp index b9ff879da8c843..183deaccfb388f 100644 --- a/inference-engine/ie_bridges/python/src/openvino/offline_transformations/offline_transformations_api_impl.cpp +++ b/inference-engine/ie_bridges/python/src/openvino/offline_transformations/offline_transformations_api_impl.cpp @@ -26,16 +26,9 @@ void InferenceEnginePython::ApplyPOTTransformations(InferenceEnginePython::IENet manager.run_passes(network.actual->getFunction()); } -void InferenceEnginePython::ApplyLowLatencyTransformation(InferenceEnginePython::IENetwork network, int64_t num_iterations) { +void InferenceEnginePython::ApplyLowLatencyTransformation(InferenceEnginePython::IENetwork network, bool use_const_initializer) { ngraph::pass::Manager manager; - // TODO: pass num_iterations to LowLatency - manager.register_pass(); - manager.register_pass(); - - auto pass_config = manager.get_pass_config(); - pass_config->set_callback([](const std::shared_ptr& node) -> bool { - return node->get_rt_info().count("UNROLL_TI") == 0; - }); + manager.register_pass(use_const_initializer); manager.run_passes(network.actual->getFunction()); } diff --git 
a/inference-engine/ie_bridges/python/src/openvino/offline_transformations/offline_transformations_api_impl.hpp b/inference-engine/ie_bridges/python/src/openvino/offline_transformations/offline_transformations_api_impl.hpp index 504388e4afc1ad..3941c48a50cfaa 100644 --- a/inference-engine/ie_bridges/python/src/openvino/offline_transformations/offline_transformations_api_impl.hpp +++ b/inference-engine/ie_bridges/python/src/openvino/offline_transformations/offline_transformations_api_impl.hpp @@ -15,7 +15,7 @@ void ApplyMOCTransformations(InferenceEnginePython::IENetwork network, bool cf); void ApplyPOTTransformations(InferenceEnginePython::IENetwork network, std::string device); -void ApplyLowLatencyTransformation(InferenceEnginePython::IENetwork network, int64_t num_iterations); +void ApplyLowLatencyTransformation(InferenceEnginePython::IENetwork network, bool use_const_initializer = true); void ApplyPruningTransformation(InferenceEnginePython::IENetwork network); diff --git a/inference-engine/ie_bridges/python/src/openvino/offline_transformations/offline_transformations_api_impl_defs.pxd b/inference-engine/ie_bridges/python/src/openvino/offline_transformations/offline_transformations_api_impl_defs.pxd index 726880e9353f37..551e56c27a8da0 100644 --- a/inference-engine/ie_bridges/python/src/openvino/offline_transformations/offline_transformations_api_impl_defs.pxd +++ b/inference-engine/ie_bridges/python/src/openvino/offline_transformations/offline_transformations_api_impl_defs.pxd @@ -3,7 +3,6 @@ from libcpp cimport bool from libcpp.string cimport string -from libc.stdint cimport int64_t from ..inference_engine.ie_api_impl_defs cimport IENetwork @@ -12,10 +11,10 @@ cdef extern from "offline_transformations_api_impl.hpp" namespace "InferenceEngi cdef void ApplyPOTTransformations(IENetwork network, string device) - cdef void ApplyLowLatencyTransformation(IENetwork network, int64_t num_iterations) + cdef void ApplyLowLatencyTransformation(IENetwork network, bool use_const_initializer) cdef void ApplyPruningTransformation(IENetwork network) cdef void GenerateMappingFile(IENetwork network, string path, bool extract_names) - cdef void CheckAPI() \ No newline at end of file + cdef void CheckAPI() diff --git a/inference-engine/ie_bridges/python/src/openvino/test_utils/CMakeLists.txt b/inference-engine/ie_bridges/python/src/openvino/test_utils/CMakeLists.txt index 504125d9823c1a..8367f941d9f793 100644 --- a/inference-engine/ie_bridges/python/src/openvino/test_utils/CMakeLists.txt +++ b/inference-engine/ie_bridges/python/src/openvino/test_utils/CMakeLists.txt @@ -48,4 +48,5 @@ add_custom_command(TARGET ${TARGET_NAME} COMMAND ${CMAKE_COMMAND} -E copy ${PYTHON_BRIDGE_SRC_ROOT}/src/openvino/test_utils/__init__.py ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/__init__.py ) -add_clang_format_target(${TARGET_NAME}_clang FOR_TARGETS ${TARGET_NAME}) \ No newline at end of file +add_clang_format_target(${TARGET_NAME}_clang FOR_TARGETS ${TARGET_NAME} + EXCLUDE_PATTERNS ".*\\.cxx;.*\\.pxd;.*\\.pyx") \ No newline at end of file diff --git a/inference-engine/ie_bridges/python/tests/test_InferRequest.py b/inference-engine/ie_bridges/python/tests/test_InferRequest.py index 6928944139d722..af79c0ff155bf8 100644 --- a/inference-engine/ie_bridges/python/tests/test_InferRequest.py +++ b/inference-engine/ie_bridges/python/tests/test_InferRequest.py @@ -16,6 +16,20 @@ path_to_img = image_path() +def create_function_with_memory(input_shape, data_type): + import ngraph as ng + from ngraph.impl import Function, Type + + input_data = 
ng.parameter(input_shape, name="input_data", dtype=data_type) + rv = ng.read_value(input_data, "var_id_667") + add = ng.add(rv, input_data, name="MemoryAdd") + node = ng.assign(add, "var_id_667") + res = ng.result(add, "res") + func = Function(results=[res], sinks=[node], parameters=[input_data], name="name") + caps = Function.to_capsule(func) + return caps + + def read_image(): import cv2 n, c, h, w = (1, 3, 32, 32) @@ -525,28 +539,56 @@ def test_resize_algorithm_work(device): assert np.allclose(res_1, res_2, atol=1e-2, rtol=1e-2) -# issue 56653 -@pytest.mark.skip(reason="Test will enable when nGraph Python API allows to create network with memory") -def test_query_state(device): - import ngraph as ng - from ngraph.impl import Function - input_data = ng.parameter([5, 7], name="input_data", dtype=np.float32) - rv = ng.read_value(input_data, "var_id_667") - #a = ng.add(rv, input_data) - node = ng.assign(rv, "var_id_667") - res = ng.result(rv, "res") - func = Function([res], sinks=[node], parameters=[input_data], name='test') - caps = Function.to_capsule(func) +@pytest.mark.parametrize("mode", ["set_init_memory_state", "reset_memory_state", "normal"]) +@pytest.mark.parametrize("data_type", ["FP32", "FP16", "I32"]) +@pytest.mark.parametrize("input_shape", [[10], [10, 10], [10, 10, 10], [2, 10, 10, 10]]) +@pytest.mark.skipif(os.environ.get("TEST_DEVICE", "CPU") != "CPU", + reason=f"Can't run test on device {os.environ.get('TEST_DEVICE', 'CPU')}, " + "Memory layers fully supported only on CPU") +def test_query_state_write_buffer(device, input_shape, data_type, mode): + ie_core = ie.IECore() + if device == "CPU": + if ie_core.get_metric(device, "FULL_DEVICE_NAME") == "arm_compute::NEON": + pytest.skip("Can't run on ARM plugin") - net = ie.IENetwork(caps) + layout = ["C", "HW", "CHW", "NCHW"] + np_data_type = {"FP32": np.float32, "FP16": np.float16, "I32": np.int32} + + from openvino.inference_engine import TensorDesc, Blob + + net = ie.IENetwork(create_function_with_memory(input_shape, np_data_type[data_type])) ie_core = ie.IECore() exec_net = ie_core.load_network(network=net, device_name=device, num_requests=1) request = exec_net.requests[0] mem_states = request.query_state() mem_state = mem_states[0] - with pytest.raises(ValueError) as e: - ones_arr = np.ones(shape=(1, 800), dtype=np.float32) - mem_state.state.buffer[:] = ones_arr - assert "assignment destination is read-only" in str(e.value) - assert mem_state.name == 'id_1' - assert mem_state.state.tensor_desc.precision == 'FP32' + + assert mem_state.name == 'var_id_667' + # todo: Uncomment after fix 45611, + # CPU plugin returns outputs and memory state in FP32 in case of FP16 original precision + #assert mem_state.state.tensor_desc.precision == data_type + + for i in range(1, 10): + if mode == "set_init_memory_state": + # create initial value + const_init = 5 + init_array = np.full(input_shape, const_init, dtype=np_data_type[mem_state.state.tensor_desc.precision]) + tensor_desc = TensorDesc(mem_state.state.tensor_desc.precision, input_shape, layout[len(input_shape) - 1]) + blob = Blob(tensor_desc, init_array) + mem_state.state = blob + + res = exec_net.infer({"input_data": np.full(input_shape, 1, dtype=np_data_type[data_type])}) + expected_res = np.full(input_shape, 1 + const_init, dtype=np_data_type[data_type]) + elif mode == "reset_memory_state": + # reset initial state of ReadValue to zero + mem_state.reset() + res = exec_net.infer({"input_data": np.full(input_shape, 1, dtype=np_data_type[data_type])}) + + # always ones + expected_res = 
np.full(input_shape, 1, dtype=np_data_type[data_type]) + else: + res = exec_net.infer({"input_data": np.full(input_shape, 1, dtype=np_data_type[data_type])}) + expected_res = np.full(input_shape, i, dtype=np_data_type[data_type]) + + assert np.allclose(res['MemoryAdd'], expected_res, atol=1e-6), \ + "Expected values: {} \n Actual values: {} \n".format(expected_res, res) diff --git a/inference-engine/ie_bridges/python/tests/test_offline_api.py b/inference-engine/ie_bridges/python/tests/test_offline_api.py index b5565c04bb4d66..0bba0951c27c87 100644 --- a/inference-engine/ie_bridges/python/tests/test_offline_api.py +++ b/inference-engine/ie_bridges/python/tests/test_offline_api.py @@ -49,4 +49,4 @@ def test_pruning_transformations(): f = ng.function_from_cnn(net) assert f != None - assert len(f.get_ops()) == 3 \ No newline at end of file + assert len(f.get_ops()) == 3 diff --git a/inference-engine/include/cldnn/cldnn_config.hpp b/inference-engine/include/cldnn/cldnn_config.hpp index cbc2aef0242101..3e5dc4cfb126f6 100644 --- a/inference-engine/include/cldnn/cldnn_config.hpp +++ b/inference-engine/include/cldnn/cldnn_config.hpp @@ -11,47 +11,11 @@ #pragma once #include "ie_plugin_config.hpp" +#include "ie_api.h" +#include "gpu/gpu_config.hpp" namespace InferenceEngine { -namespace Metrics { - -/** - * @def GPU_METRIC_KEY(name) - * @brief shortcut for defining GPU plugin metrics - */ -#define GPU_METRIC_KEY(name) METRIC_KEY(GPU_##name) -#define DECLARE_GPU_METRIC_KEY(name, ...) DECLARE_METRIC_KEY(GPU_##name, __VA_ARGS__) - -/** - * @def DECLARE_GPU_METRIC_VALUE(name) - * @brief shortcut for defining gpu metric values - */ -#define DECLARE_GPU_METRIC_VALUE(name) DECLARE_METRIC_VALUE(GPU_##name) - -/** - * @brief Metric which defines size of memory in bytes available for the device. For iGPU it returns host memory size, for dGPU - dedicated gpu memory size - */ -DECLARE_GPU_METRIC_KEY(DEVICE_TOTAL_MEM_SIZE, uint64_t); - -/** - * @brief Metric to get microarchitecture identifier in major.minor.revision format - */ -DECLARE_GPU_METRIC_KEY(UARCH_VERSION, std::string); - -/** - * @brief Metric to get count of execution units for current GPU - */ -DECLARE_GPU_METRIC_KEY(EXECUTION_UNITS_COUNT, int); - -/** - * @brief Possible return value for OPTIMIZATION_CAPABILITIES metric - * - "HW_MATMUL" - Defines if device has hardware block for matrix multiplication - */ -DECLARE_GPU_METRIC_VALUE(HW_MATMUL); - -} // namespace Metrics - /** * @brief GPU plugin configuration */ @@ -70,6 +34,7 @@ namespace CLDNNConfigParams { * this option should be used with an unsigned integer value (1 is lowest priority) * 0 means no priority hint is set and default queue is created. */ +INFERENCE_ENGINE_DEPRECATED("Use InferenceEngine::GPUConfigParams::GPU_PLUGIN_PRIORITY instead") DECLARE_CLDNN_CONFIG_KEY(PLUGIN_PRIORITY); /** @@ -78,22 +43,26 @@ DECLARE_CLDNN_CONFIG_KEY(PLUGIN_PRIORITY); * chapter 9.19. This option should be used with an unsigned integer value (1 is lowest energy consumption) * 0 means no throttle hint is set and default queue created. */ +INFERENCE_ENGINE_DEPRECATED("Use InferenceEngine::GPUConfigParams::GPU_PLUGIN_THROTTLE instead") DECLARE_CLDNN_CONFIG_KEY(PLUGIN_THROTTLE); /** * @brief This key controls clDNN memory pool optimization. * Turned off by default. */ +INFERENCE_ENGINE_DEPRECATED("The config key will be removed") DECLARE_CLDNN_CONFIG_KEY(MEM_POOL); /** * @brief This key defines the directory name to which clDNN graph visualization will be dumped. 
*/ +INFERENCE_ENGINE_DEPRECATED("The config key will be removed") DECLARE_CLDNN_CONFIG_KEY(GRAPH_DUMPS_DIR); /** * @brief This key defines the directory name to which full program sources will be dumped. */ +INFERENCE_ENGINE_DEPRECATED("The config key will be removed") DECLARE_CLDNN_CONFIG_KEY(SOURCES_DUMPS_DIR); /** @@ -108,43 +77,19 @@ DECLARE_CLDNN_CONFIG_KEY(ENABLE_FP16_FOR_QUANTIZED_MODELS); * @brief This key should be set to correctly handle NV12 input without pre-processing. * Turned off by default. */ +INFERENCE_ENGINE_DEPRECATED("Use InferenceEngine::GPUConfigParams::GPU_NV12_TWO_INPUTS instead") DECLARE_CLDNN_CONFIG_KEY(NV12_TWO_INPUTS); -/** - * @brief This key sets the max number of host threads that can be used by GPU plugin on model loading. - * Default value is maximum number of threads available in the environment. - */ -DECLARE_CLDNN_CONFIG_KEY(MAX_NUM_THREADS); - -/** - * @brief Turning on this key enables to unroll recurrent layers such as TensorIterator or Loop with fixed iteration count. - * This key is turned on by default. Turning this key on will achieve better inference performance for loops with not too many iteration counts (less than 16, as a rule of thumb). - * Turning this key off will achieve better performance for both graph loading time and inference time with many iteration counts (greater than 16). - * Note that turning this key on will increase the graph loading time in proportion to the iteration counts. - * Thus, this key should be turned off if graph loading time is considered to be most important target to optimize.*/ -DECLARE_CLDNN_CONFIG_KEY(ENABLE_LOOP_UNROLLING); - } // namespace CLDNNConfigParams namespace PluginConfigParams { -/** - * @brief Optimize GPU plugin execution to maximize throughput. - * - * It is passed to Core::SetConfig(), this option should be used with values: - * - KEY_GPU_THROUGHPUT_AUTO creates bare minimum of streams that might improve performance in some cases, - * this option allows to enable throttle hint for opencl queue thus reduce CPU load without significant performance - * drop - * - a positive integer value creates the requested number of streams - */ -DECLARE_CONFIG_VALUE(GPU_THROUGHPUT_AUTO); -DECLARE_CONFIG_KEY(GPU_THROUGHPUT_STREAMS); - /** * @brief This key enables dumping of the kernels used by the plugin for custom layers. * * This option should be used with values: PluginConfigParams::YES or PluginConfigParams::NO (default) */ +INFERENCE_ENGINE_DEPRECATED("The config key will be removed") DECLARE_CONFIG_KEY(DUMP_KERNELS); /** @@ -159,17 +104,24 @@ DECLARE_CONFIG_KEY(DUMP_KERNELS); * * For values TUNING_CREATE and TUNING_RETUNE the file will be created if it does not exist. 
*/ +INFERENCE_ENGINE_DEPRECATED("The config key will be removed") DECLARE_CONFIG_KEY(TUNING_MODE); +INFERENCE_ENGINE_DEPRECATED("The config value will be removed") DECLARE_CONFIG_VALUE(TUNING_CREATE); +INFERENCE_ENGINE_DEPRECATED("The config value will be removed") DECLARE_CONFIG_VALUE(TUNING_USE_EXISTING); +INFERENCE_ENGINE_DEPRECATED("The config value will be removed") DECLARE_CONFIG_VALUE(TUNING_DISABLED); +INFERENCE_ENGINE_DEPRECATED("The config value will be removed") DECLARE_CONFIG_VALUE(TUNING_UPDATE); +INFERENCE_ENGINE_DEPRECATED("The config value will be removed") DECLARE_CONFIG_VALUE(TUNING_RETUNE); /** * @brief This key defines the tuning data filename to be created/used */ +INFERENCE_ENGINE_DEPRECATED("The config key will be removed") DECLARE_CONFIG_KEY(TUNING_FILE); } // namespace PluginConfigParams diff --git a/inference-engine/include/cpp/ie_executable_network.hpp b/inference-engine/include/cpp/ie_executable_network.hpp index eb1824f9da041e..81d5b10e7dd4be 100644 --- a/inference-engine/include/cpp/ie_executable_network.hpp +++ b/inference-engine/include/cpp/ie_executable_network.hpp @@ -32,9 +32,6 @@ class IExecutableNetworkInternal; class INFERENCE_ENGINE_API_CLASS(ExecutableNetwork) { details::SharedObjectLoader _so; std::shared_ptr _impl; - IE_SUPPRESS_DEPRECATED_START - std::shared_ptr actual; - IE_SUPPRESS_DEPRECATED_END /** * @brief Constructs ExecutableNetwork from the initialized std::shared_ptr @@ -51,18 +48,6 @@ class INFERENCE_ENGINE_API_CLASS(ExecutableNetwork) { */ ExecutableNetwork() = default; - IE_SUPPRESS_DEPRECATED_START - /** - * @deprecated This ctor will be removed in 2022.1 - * @brief Constructs ExecutableNetwork from the initialized std::shared_ptr - * @param exec Initialized shared pointer - * @param splg Plugin to use. This is required to ensure that ExecutableNetwork can work properly even if plugin object is destroyed. - */ - INFERENCE_ENGINE_DEPRECATED("This ctor will be removed in 2022.1") - explicit ExecutableNetwork(std::shared_ptr exec, - std::shared_ptr splg = {}); - IE_SUPPRESS_DEPRECATED_END - /** * @brief Gets the Executable network output Data node information. * diff --git a/inference-engine/include/cpp/ie_infer_request.hpp b/inference-engine/include/cpp/ie_infer_request.hpp index fd71bf18bc2cc1..c5d52ec6fc0905 100644 --- a/inference-engine/include/cpp/ie_infer_request.hpp +++ b/inference-engine/include/cpp/ie_infer_request.hpp @@ -35,10 +35,6 @@ class ICompletionCallbackWrapper; class INFERENCE_ENGINE_API_CLASS(InferRequest) { details::SharedObjectLoader _so; std::shared_ptr _impl; - IE_SUPPRESS_DEPRECATED_START - IInferRequest::Ptr actual; - std::shared_ptr callback; - IE_SUPPRESS_DEPRECATED_END /** * @brief Constructs InferRequest from the initialized std::shared_ptr @@ -71,18 +67,6 @@ class INFERENCE_ENGINE_API_CLASS(InferRequest) { */ InferRequest() = default; - IE_SUPPRESS_DEPRECATED_START - /** - * @deprecated This ctor will be removed in 2022.1 - * @brief Constructs InferRequest from the initialized std::shared_ptr - * @param request Initialized shared pointer - * @param splg Plugin to use. This is required to ensure that InferRequest can work properly even if plugin object is destroyed. 
- */ - INFERENCE_ENGINE_DEPRECATED("This ctor will be removed in 2022.1") - explicit InferRequest(IInferRequest::Ptr request, - std::shared_ptr splg = {}); - IE_SUPPRESS_DEPRECATED_END - /** * @brief Sets input/output data to infer * diff --git a/inference-engine/include/cpp/ie_memory_state.hpp b/inference-engine/include/cpp/ie_memory_state.hpp index 0c055cec40cc79..8d54f79f06ce6a 100644 --- a/inference-engine/include/cpp/ie_memory_state.hpp +++ b/inference-engine/include/cpp/ie_memory_state.hpp @@ -3,7 +3,7 @@ // /** - * @brief A header file that provides wrapper classes for IVariableState + * @brief A header file that provides VariableState * * @file ie_memory_state.hpp */ @@ -16,21 +16,17 @@ #include "ie_api.h" #include "ie_blob.h" #include "details/ie_so_loader.h" -#include "ie_imemory_state.hpp" namespace InferenceEngine { class IVariableStateInternal; /** - * @brief C++ exception based error reporting wrapper of API class IVariableState + * @brief VariableState class */ class INFERENCE_ENGINE_API_CLASS(VariableState) { details::SharedObjectLoader _so; std::shared_ptr _impl; - IE_SUPPRESS_DEPRECATED_START - std::shared_ptr actual; - IE_SUPPRESS_DEPRECATED_END /** * @brief Constructs VariableState from the initialized std::shared_ptr @@ -48,55 +44,27 @@ class INFERENCE_ENGINE_API_CLASS(VariableState) { */ VariableState() = default; - IE_SUPPRESS_DEPRECATED_START /** - * @deprecated This ctor will be removed in 2022.1 - * @brief constructs VariableState from the initialized std::shared_ptr - * @param pState Initialized shared pointer - * @param plg Optional: Plugin to use. This is required to ensure that VariableState can work properly even if plugin object is destroyed. - */ - INFERENCE_ENGINE_DEPRECATED("This ctor will be removed in 2022.1") - explicit VariableState(std::shared_ptr pState, - std::shared_ptr plg = {}); - IE_SUPPRESS_DEPRECATED_END - - /** - * @copybrief IVariableState::Reset - * - * Wraps IVariableState::Reset + * @brief Resets the internal variable state of the relevant infer request + * to the value specified as default for the corresponding ReadValue node */ void Reset(); /** - * @copybrief IVariableState::GetName - * - * Wraps IVariableState::GetName + * @brief Gets the name of the current variable state. The name matches the `variable_id` + * of the corresponding `ReadValue` node. * @return A string representing a state name */ std::string GetName() const; /** - * @copybrief IVariableState::GetState - * - * Wraps IVariableState::GetState + * @brief Returns the value of the variable state. * @return A blob representing a state */ Blob::CPtr GetState() const; /** - * @copybrief IVariableState::GetLastState - * @deprecated Use IVariableState::SetState instead - * - * Wraps IVariableState::GetLastState - * @return A blob representing a last state - */ - INFERENCE_ENGINE_DEPRECATED("Use VariableState::GetState function instead") - Blob::CPtr GetLastState() const; - - /** - * @copybrief IVariableState::SetState - * - * Wraps IVariableState::SetState + * @brief Sets the new state for the next inference.
* @param state The current state to set */ void SetState(Blob::Ptr state); diff --git a/inference-engine/include/gna/gna_config.hpp b/inference-engine/include/gna/gna_config.hpp index 958227696a1c98..3433ab58887f7a 100644 --- a/inference-engine/include/gna/gna_config.hpp +++ b/inference-engine/include/gna/gna_config.hpp @@ -65,6 +65,7 @@ DECLARE_GNA_CONFIG_KEY(DEVICE_MODE); DECLARE_GNA_CONFIG_VALUE(AUTO); DECLARE_GNA_CONFIG_VALUE(HW); +DECLARE_GNA_CONFIG_VALUE(HW_WITH_SW_FBACK); DECLARE_GNA_CONFIG_VALUE(SW); DECLARE_GNA_CONFIG_VALUE(SW_EXACT); DECLARE_GNA_CONFIG_VALUE(SW_FP32); diff --git a/inference-engine/include/gpu/gpu_config.hpp b/inference-engine/include/gpu/gpu_config.hpp new file mode 100644 index 00000000000000..96f8754ac8660a --- /dev/null +++ b/inference-engine/include/gpu/gpu_config.hpp @@ -0,0 +1,120 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +/** + * @brief A header for advanced hardware related properties for GPU plugin + * To use in SetConfig() method of plugins + * + * @file gpu_config.hpp + */ +#pragma once + +#include "ie_plugin_config.hpp" + +namespace InferenceEngine { + +namespace Metrics { + +/** + * @def GPU_METRIC_KEY(name) + * @brief shortcut for defining GPU plugin metrics + */ +#define GPU_METRIC_KEY(name) METRIC_KEY(GPU_##name) +#define DECLARE_GPU_METRIC_KEY(name, ...) DECLARE_METRIC_KEY(GPU_##name, __VA_ARGS__) + +/** + * @def DECLARE_GPU_METRIC_VALUE(name) + * @brief shortcut for defining gpu metric values + */ +#define DECLARE_GPU_METRIC_VALUE(name) DECLARE_METRIC_VALUE(GPU_##name) + +/** + * @brief Metric which defines size of memory in bytes available for the device. For iGPU it returns host memory size, for dGPU - dedicated gpu memory size + */ +DECLARE_GPU_METRIC_KEY(DEVICE_TOTAL_MEM_SIZE, uint64_t); + +/** + * @brief Metric to get microarchitecture identifier in major.minor.revision format + */ +DECLARE_GPU_METRIC_KEY(UARCH_VERSION, std::string); + +/** + * @brief Metric to get count of execution units for current GPU + */ +DECLARE_GPU_METRIC_KEY(EXECUTION_UNITS_COUNT, int); + +/** + * @brief Possible return value for OPTIMIZATION_CAPABILITIES metric + * - "HW_MATMUL" - Defines if device has hardware block for matrix multiplication + */ +DECLARE_GPU_METRIC_VALUE(HW_MATMUL); + +} // namespace Metrics + +/** + * @brief GPU plugin configuration + */ +namespace GPUConfigParams { + +/** + * @brief shortcut for defining configuration keys + */ +#define GPU_CONFIG_KEY(name) InferenceEngine::GPUConfigParams::_CONFIG_KEY(GPU_##name) +#define DECLARE_GPU_CONFIG_KEY(name) DECLARE_CONFIG_KEY(GPU_##name) +#define DECLARE_GPU_CONFIG_VALUE(name) DECLARE_CONFIG_VALUE(GPU_##name) + +/** + * @brief This key instructs the GPU plugin to use the OpenCL queue priority hint + * as defined in https://www.khronos.org/registry/OpenCL/specs/opencl-2.1-extensions.pdf + * this option should be used with an unsigned integer value (1 is lowest priority) + * 0 means no priority hint is set and default queue is created. + */ +DECLARE_GPU_CONFIG_KEY(PLUGIN_PRIORITY); + +/** + * @brief This key instructs the GPU plugin to use the OpenCL queue throttle hint + * as defined in https://www.khronos.org/registry/OpenCL/specs/opencl-2.1-extensions.pdf, + * chapter 9.19. This option should be used with an unsigned integer value (1 is lowest energy consumption) + * 0 means no throttle hint is set and a default queue is created.
+ */ +DECLARE_GPU_CONFIG_KEY(PLUGIN_THROTTLE); + +/** + * @brief This key should be set to correctly handle NV12 input without pre-processing. + * Turned off by default. + */ +DECLARE_GPU_CONFIG_KEY(NV12_TWO_INPUTS); + +/** + * @brief This key sets the max number of host threads that can be used by GPU plugin on model loading. + * Default value is maximum number of threads available in the environment. + */ +DECLARE_GPU_CONFIG_KEY(MAX_NUM_THREADS); + +/** + * @brief Turning on this key enables to unroll recurrent layers such as TensorIterator or Loop with fixed iteration count. + * This key is turned on by default. Turning this key on will achieve better inference performance for loops with not too many iteration counts (less than 16, as a rule of thumb). + * Turning this key off will achieve better performance for both graph loading time and inference time with many iteration counts (greater than 16). + * Note that turning this key on will increase the graph loading time in proportion to the iteration counts. + * Thus, this key should be turned off if graph loading time is considered to be most important target to optimize.*/ +DECLARE_GPU_CONFIG_KEY(ENABLE_LOOP_UNROLLING); + +} // namespace GPUConfigParams + +namespace PluginConfigParams { + +/** + * @brief Optimize GPU plugin execution to maximize throughput. + * + * It is passed to Core::SetConfig(), this option should be used with values: + * - KEY_GPU_THROUGHPUT_AUTO creates bare minimum of streams that might improve performance in some cases, + * this option allows to enable throttle hint for opencl queue thus reduce CPU load without significant performance + * drop + * - a positive integer value creates the requested number of streams + */ +DECLARE_CONFIG_VALUE(GPU_THROUGHPUT_AUTO); +DECLARE_CONFIG_KEY(GPU_THROUGHPUT_STREAMS); +} // namespace PluginConfigParams + +} // namespace InferenceEngine diff --git a/inference-engine/include/ie_core.hpp b/inference-engine/include/ie_core.hpp index e87f8c65719085..96f8d6b58af0e0 100644 --- a/inference-engine/include/ie_core.hpp +++ b/inference-engine/include/ie_core.hpp @@ -174,9 +174,18 @@ class INFERENCE_ENGINE_API_CLASS(Core) { * operation* * @return An executable network reference */ - ExecutableNetwork ImportNetwork(std::istream& networkModel, const std::string& deviceName = {}, + ExecutableNetwork ImportNetwork(std::istream& networkModel, const std::string& deviceName, const std::map& config = {}); + /** + * @deprecated Use Core::ImportNetwork with explicit device name + * @brief Creates an executable network from a previously exported network + * @param networkModel network model stream + * @return An executable network reference + */ + INFERENCE_ENGINE_DEPRECATED("Use Core::ImportNetwork with explicit device name") + ExecutableNetwork ImportNetwork(std::istream& networkModel); + /** * @brief Creates an executable network from a previously exported network within a specified * remote context. 
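A minimal usage sketch for the `ie_core.hpp` change above (illustrative only, not part of the patch): it shows how caller code is expected to pass the device name explicitly to `Core::ImportNetwork`, with the blob path and device choice as assumptions for the example.

```cpp
#include <fstream>
#include <ie_core.hpp>

int main() {
    InferenceEngine::Core core;
    // Illustrative path to a network previously exported with ExecutableNetwork::Export().
    std::ifstream model("exported_network.blob", std::ios::binary);

    // Preferred form after this change: the target device is named explicitly.
    // "MYRIAD" is just an example of a device that supports network import.
    InferenceEngine::ExecutableNetwork exec = core.ImportNetwork(model, "MYRIAD");

    // The single-argument overload still exists but is marked
    // INFERENCE_ENGINE_DEPRECATED and should not be used in new code:
    // InferenceEngine::ExecutableNetwork legacy = core.ImportNetwork(model);
    return 0;
}
```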
diff --git a/inference-engine/include/ie_iexecutable_network.hpp b/inference-engine/include/ie_iexecutable_network.hpp index caef9bb95b95bd..bb0a6f71c4ac3f 100644 --- a/inference-engine/include/ie_iexecutable_network.hpp +++ b/inference-engine/include/ie_iexecutable_network.hpp @@ -18,7 +18,6 @@ #include "ie_common.h" #include "ie_icnn_network.hpp" #include "ie_iinfer_request.hpp" -#include "ie_imemory_state.hpp" #include "ie_input_info.hpp" #include "ie_parameter.hpp" #include "ie_remote_context.hpp" @@ -113,22 +112,6 @@ class INFERENCE_ENGINE_DEPRECATED("Use InferenceEngine::ExecutableNetwork instea INFERENCE_ENGINE_DEPRECATED("Use InferenceEngine::ExecutableNetwork::GetExecGraphInfo instead") virtual StatusCode GetExecGraphInfo(ICNNNetwork::Ptr& graphPtr, ResponseDesc* resp) noexcept = 0; - /** - * @deprecated Use InferRequest::QueryState instead - * @brief Gets state control interface for given executable network. - * - * State control essential for recurrent networks - * - * @param pState reference to a pointer that receives internal states - * @param idx requested index for receiving memory state - * @param resp Optional: pointer to an already allocated object to contain information in case of failure - * @return Status code of the operation: InferenceEngine::OK (0) for success, OUT_OF_BOUNDS (-6) no memory state for - * given index - */ - INFERENCE_ENGINE_DEPRECATED("Use InferRequest::QueryState instead") - virtual StatusCode QueryState(IVariableState::Ptr& pState, size_t idx, ResponseDesc* resp) noexcept = 0; - IE_SUPPRESS_DEPRECATED_END - /** * @brief Sets configuration for current executable network * diff --git a/inference-engine/include/ie_iinfer_request.hpp b/inference-engine/include/ie_iinfer_request.hpp index 7d762d96a11305..4fd200c0252062 100644 --- a/inference-engine/include/ie_iinfer_request.hpp +++ b/inference-engine/include/ie_iinfer_request.hpp @@ -17,7 +17,6 @@ #include "ie_blob.h" #include "ie_common.h" #include "ie_preprocess.hpp" -#include "ie_imemory_state.hpp" namespace InferenceEngine { @@ -195,21 +194,6 @@ class INFERENCE_ENGINE_DEPRECATED("Use InferenceEngine::InferRequest C++ wrapper */ virtual InferenceEngine::StatusCode SetBatch(int batch_size, ResponseDesc* resp) noexcept = 0; - IE_SUPPRESS_DEPRECATED_START - /** - * @brief Gets state control interface for given infer request. 
- * - * State control essential for recurrent networks - * - * @param pState reference to a pointer that receives internal states - * @param idx requested index for receiving memory state - * @param resp Optional: pointer to an already allocated object to contain information in case of failure - * @return Status code of the operation: InferenceEngine::OK (0) for success, OUT_OF_BOUNDS (-6) no memory state for - * given index - */ - virtual StatusCode QueryState(IVariableState::Ptr& pState, size_t idx, ResponseDesc* resp) noexcept = 0; - IE_SUPPRESS_DEPRECATED_END - protected: ~IInferRequest() = default; }; diff --git a/inference-engine/include/ie_imemory_state.hpp b/inference-engine/include/ie_imemory_state.hpp deleted file mode 100644 index 7f3ef99cbd11d3..00000000000000 --- a/inference-engine/include/ie_imemory_state.hpp +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -/** - * @brief a header file for IVariableState interface - * - * @file ie_imemory_state.hpp - */ - -#pragma once - -#include - -#include "ie_blob.h" -#include "ie_common.h" - -namespace InferenceEngine { - -/** - * @deprecated Use InferenceEngine::VariableState C++ wrapper instead - * @interface IVariableState - * @brief Manages data for reset operations - */ -class INFERENCE_ENGINE_DEPRECATED("InferenceEngine::") IVariableState { -public: - IE_SUPPRESS_DEPRECATED_START - /** - * @brief A shared pointer to the IVariableState interface - */ - using Ptr = std::shared_ptr; - IE_SUPPRESS_DEPRECATED_END - - /** - * @brief Gets name of current variable state, if length of array is not enough name is truncated by len, null - * terminator is inserted as well. As variable state name `variable_id` from according `ReadValue` used. - * - * @param name preallocated buffer for receiving name - * @param len Length of the buffer - * @param resp Optional: pointer to an already allocated object to contain information in case of failure - * @return Status code of the operation: InferenceEngine::OK (0) for success - */ - virtual StatusCode GetName(char* name, size_t len, ResponseDesc* resp) const noexcept = 0; - - /** - * @brief Reset internal variable state for relevant infer request, to a value specified as default for according ReadValue node - * - * @param resp Optional: pointer to an already allocated object to contain information in case of failure - * @return Status code of the operation: InferenceEngine::OK (0) for success* - */ - virtual StatusCode Reset(ResponseDesc* resp) noexcept = 0; - - /** - * @brief Sets the new state for the next inference. - * - * This method can fail if Blob size does not match the internal state size or precision - * - * @param newState The data to use as new state - * @param resp Optional: pointer to an already allocated object to contain information in case of failure - * @return Status code of the operation: InferenceEngine::OK (0) for success - */ - virtual StatusCode SetState(Blob::Ptr newState, ResponseDesc* resp) noexcept = 0; - - /** - * @brief Returns the value of the variable state. 
- * - * @param state A reference to a blob containing a variable state - * @param resp Optional: pointer to an already allocated object to contain information in case of failure - * @return Status code of the operation: InferenceEngine::OK (0) for success - */ - INFERENCE_ENGINE_DEPRECATED("Use GetState function instead") - virtual StatusCode GetLastState(Blob::CPtr& state, ResponseDesc* resp) const noexcept { - return GetState(state, resp); - } - - /** - * @brief Returns the value of the variable state. - * - * @param state A reference to a blob containing a variable state - * @param resp Optional: pointer to an already allocated object to contain information in case of failure - * @return Status code of the operation: InferenceEngine::OK (0) for success - */ - virtual StatusCode GetState(Blob::CPtr& state, ResponseDesc* resp) const noexcept = 0; -}; - -IE_SUPPRESS_DEPRECATED_START - -/** - * @brief For compatibility reasons. - */ -using IMemoryState = IVariableState; - -IE_SUPPRESS_DEPRECATED_END - -} // namespace InferenceEngine \ No newline at end of file diff --git a/inference-engine/include/ie_parameter.hpp b/inference-engine/include/ie_parameter.hpp index 1343f89db32939..4aa6760d474874 100644 --- a/inference-engine/include/ie_parameter.hpp +++ b/inference-engine/include/ie_parameter.hpp @@ -49,26 +49,6 @@ class INFERENCE_ENGINE_API_CLASS(Parameter) { std::swap(ptr, parameter.ptr); } - /** - * @deprecated Use ngraph::Variant directly - * @brief Creates parameter from variant. - * This method creates empty parameter if variant doesn't contain Parameter - * - * @param var ngraph variant - */ - INFERENCE_ENGINE_DEPRECATED("Use ngraph::Variant directly") - Parameter(const std::shared_ptr& var); - - /** - * @deprecated Use ngraph::Variant directly - * @brief Creates parameter from variant. - * This method creates empty parameter if variant doesn't contain Parameter - * - * @param var ngraph variant - */ - INFERENCE_ENGINE_DEPRECATED("Use ngraph::Variant directly") - Parameter(std::shared_ptr& var); - /** * @brief Copy constructor * @@ -86,7 +66,8 @@ class INFERENCE_ENGINE_API_CLASS(Parameter) { * @param parameter object */ template ::type, Parameter>::value>::type> + typename = typename std::enable_if::type, Parameter>::value && + !std::is_abstract::type>::value>::type> Parameter(T&& parameter) { // NOLINT static_assert(!std::is_same::type, Parameter>::value, "To prevent recursion"); ptr = new RealData::type>(std::forward(parameter)); @@ -203,28 +184,6 @@ class INFERENCE_ENGINE_API_CLASS(Parameter) { return dyn_cast::type>(ptr); } - /** - * @deprecated Use ngraph::Variant directly - * @brief Converts parameter to shared pointer on ngraph::Variant - * - * @return shared pointer on ngraph::Variant - */ - INFERENCE_ENGINE_DEPRECATED("Use ngraph::Variant directly") - std::shared_ptr asVariant() const; - - /** - * @deprecated Use ngraph::Variant directly - * @brief Casts to shared pointer on ngraph::Variant - * - * @return shared pointer on ngraph::Variant - */ - INFERENCE_ENGINE_DEPRECATED("Use ngraph::Variant directly") - operator std::shared_ptr() const { - IE_SUPPRESS_DEPRECATED_START - return asVariant(); - IE_SUPPRESS_DEPRECATED_END - } - /** * Dynamic cast to specified type * @tparam T type @@ -254,6 +213,21 @@ class INFERENCE_ENGINE_API_CLASS(Parameter) { return !(*this == rhs); } + /** + * @brief Prints underlying object to the given output stream. + * Uses operator<< if it is defined, leaves stream unchanged otherwise. 
+ * In case of empty parameter or nullptr stream immediately returns. + * + * @param object Object to be printed to the given output stream. + * @param stream Output stream object will be printed to. + */ + friend void PrintTo(const Parameter& object, std::ostream* stream) { + if (object.empty() || !stream) { + return; + } + object.ptr->print(*stream); + } + private: template struct CheckOperatorEqual { @@ -273,6 +247,24 @@ class INFERENCE_ENGINE_API_CLASS(Parameter) { template struct HasOperatorEqual : CheckOperatorEqual::type {}; + template + struct CheckOutputStreamOperator { + template + static auto test(W*) -> decltype(std::declval() << std::declval(), std::true_type()) { + return {}; + } + + template + static auto test(...) -> std::false_type { + return {}; + } + + using type = typename std::is_same(nullptr))>::type; + }; + + template + struct HasOutputStreamOperator : CheckOutputStreamOperator::type {}; + struct Any { #ifdef __ANDROID__ virtual ~Any(); @@ -282,6 +274,7 @@ class INFERENCE_ENGINE_API_CLASS(Parameter) { virtual bool is(const std::type_info&) const = 0; virtual Any* copy() const = 0; virtual bool operator==(const Any& rhs) const = 0; + virtual void print(std::ostream&) const = 0; }; template @@ -318,6 +311,20 @@ class INFERENCE_ENGINE_API_CLASS(Parameter) { bool operator==(const Any& rhs) const override { return rhs.is(typeid(T)) && equal(*this, rhs); } + + template + typename std::enable_if::value, void>::type + print(std::ostream& stream, const U& object) const {} + + template + typename std::enable_if::value, void>::type + print(std::ostream& stream, const U& object) const { + stream << object; + } + + void print(std::ostream& stream) const override { + print(stream, get()); + } }; template diff --git a/inference-engine/include/ie_transformations.hpp b/inference-engine/include/ie_transformations.hpp index 6691fa74daeedd..62b92a995781f3 100644 --- a/inference-engine/include/ie_transformations.hpp +++ b/inference-engine/include/ie_transformations.hpp @@ -16,6 +16,7 @@ namespace InferenceEngine { /** + * @deprecated Use InferenceEngine::lowLatency2 instead * @brief The transformation finds all TensorIterator layers in the network, processes all back * edges that describe a connection between Result and Parameter of the TensorIterator body, * and inserts ReadValue layer between Parameter and the next layers after this Parameter, @@ -50,7 +51,41 @@ namespace InferenceEngine { * network->infer (...) // Using stored states, calculating new values for states. * * @param network A network to apply LowLatency transformation - * * */ + +INFERENCE_ENGINE_DEPRECATED("This transformation will be removed in 2023.1. " + "Use InferenceEngine::lowLatency2 instead.") INFERENCE_ENGINE_API_CPP(void) LowLatency(InferenceEngine::CNNNetwork& network); + + +/** + * @brief The transformation finds all TensorIterator/Loop layers in the network, + * processes all back edges that describe a connection between Result and Parameter + * of the TensorIterator/Loop bodies,and inserts ReadValue and Assign layers at the + * input and output corresponding to this back edge. + * Supported platforms: CPU, GNA. + * + * The example below describes the changes made by the transformation + * [] - TensorIterator body + * () - new layer + * BE - back-edge + * + * before applying the transformation: + * -> input1[BE_1 -> Parameter -> Layers ... 
-> Result -> BE_1 ]output1-> + * + * after applying the transformation: + * ->(ReadValue)-> input1[BE_1 ->Parameter->Layers ...->Result->BE_1]output1 ->(Assign) + * \ + * ->... + * After applying the transformation, the resulting network can be inferred + * step by step; the states will be stored between inferences. + * @param network A network to apply the LowLatency2 transformation + * @param use_const_initializer Changes the type of the initializing subgraph for ReadValue operations. If "true", then the transformation inserts a Constant before the ReadValue operation. If "false", then the transformation leaves the existing initializing subgraph for the ReadValue operation. */ +INFERENCE_ENGINE_API_CPP(void) lowLatency2(InferenceEngine::CNNNetwork& network, + bool use_const_initializer = true); + } // namespace InferenceEngine diff --git a/inference-engine/include/ie_unicode.hpp b/inference-engine/include/ie_unicode.hpp deleted file mode 100644 index dc943d6f558be5..00000000000000 --- a/inference-engine/include/ie_unicode.hpp +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -/** - * @brief This is a header file with common inference engine definitions - * - * @file ie_unicode.hpp - */ -#pragma once - -#include -#include -#include -#include -#include -#include - -#ifdef UNICODE -typedef wchar_t tchar; -typedef std::wstring file_name_t; -#else -typedef char tchar; -typedef std::string file_name_t; -#endif - -namespace InferenceEngine { - -/** - * @deprecated Use OS-native conversion utilities - * @brief Conversion from possibly-wide character string to a single-byte chain. - * @param str A possibly-wide character string - * @return A single-byte character string - */ -INFERENCE_ENGINE_DEPRECATED("Use OS-native conversion utilities") -inline std::string fileNameToString(const file_name_t& str) { -#ifdef UNICODE - size_t maxlen = (str.length() + 1) * sizeof(wchar_t) / sizeof(char); - std::vector mbstr(maxlen); - mbstr[0] = 0; - std::wcstombs(&mbstr[0], str.c_str(), maxlen); - std::string res = std::string(&mbstr[0]); - return res; -#else - return str; -#endif -} - -/** - * @deprecated Use OS-native conversion utilities - * @brief Conversion from single-byte character string to a possibly-wide one - * @param str A single-byte character string - * @return A possibly-wide character string - */ -INFERENCE_ENGINE_DEPRECATED("Use OS-native conversion utilities") -inline file_name_t stringToFileName(const std::string& str) { -#ifdef UNICODE - size_t maxlen = str.length() + 1; - std::vector wcstr(maxlen); - wcstr[0] = 0; - std::mbstowcs(&wcstr[0], str.c_str(), maxlen); - file_name_t res = file_name_t(&wcstr[0]); - return res; -#else - return str; -#endif -} - -} // namespace InferenceEngine diff --git a/inference-engine/include/ie_version.hpp b/inference-engine/include/ie_version.hpp index 13215d0b68d253..10e649a09d32f8 100644 --- a/inference-engine/include/ie_version.hpp +++ b/inference-engine/include/ie_version.hpp @@ -20,8 +20,8 @@ * @brief Defines Inference Engine patch version */ -#define IE_VERSION_MAJOR 2021 -#define IE_VERSION_MINOR 4 +#define IE_VERSION_MAJOR 2022 +#define IE_VERSION_MINOR 1 #define IE_VERSION_PATCH 0 #include "ie_api.h" diff --git a/inference-engine/samples/CMakeLists.txt b/inference-engine/samples/CMakeLists.txt index 7924c56779dcdf..fe3952f5779789 100644 --- a/inference-engine/samples/CMakeLists.txt +++ b/inference-engine/samples/CMakeLists.txt @@
-110,19 +110,17 @@ if(NOT DEFINED CMAKE_CXX_STANDARD) endif() #################################### -set (GFLAGS_IS_SUBPROJECT TRUE) -set (HAVE_SYS_STAT_H 1) -set (HAVE_INTTYPES_H 1) -set (INTTYPES_FORMAT C99) -set (BUILD_TESTING OFF) - -if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/gflags") +if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/gflags" AND + NOT DEFINED IE_MAIN_SOURCE_DIR) function(add_gflags) - if(NOT WIN32) - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-all") - set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-all") - endif() + # common gflags settings + set(GFLAGS_IS_SUBPROJECT TRUE) + set(HAVE_SYS_STAT_H 1) + set(HAVE_INTTYPES_H 1) + set(INTTYPES_FORMAT C99) + set(BUILD_TESTING OFF) set(BUILD_SHARED_LIBS OFF) + add_subdirectory(thirdparty/gflags EXCLUDE_FROM_ALL) set_target_properties(gflags_nothreads_static PROPERTIES FOLDER thirdparty) endfunction() @@ -154,7 +152,7 @@ elseif(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/common/opencv_c_wrapper") endif() # samples build can be switched off during whole IE build -if (IE_MAIN_SOURCE_DIR AND NOT ENABLE_SAMPLES) +if (DEFINED IE_MAIN_SOURCE_DIR AND NOT ENABLE_SAMPLES) return() endif() diff --git a/inference-engine/samples/benchmark_app/inputs_filling.cpp b/inference-engine/samples/benchmark_app/inputs_filling.cpp index e12f7656f173c6..ef8a045279a57a 100644 --- a/inference-engine/samples/benchmark_app/inputs_filling.cpp +++ b/inference-engine/samples/benchmark_app/inputs_filling.cpp @@ -39,6 +39,7 @@ std::vector filterFilesByExtensions(const std::vector& return filtered; } +template void fillBlobImage(Blob::Ptr& inputBlob, const std::vector& filePaths, const size_t& batchSize, const benchmark_app::InputInfo& app_info, const size_t& requestId, const size_t& inputId, const size_t& inputSize) { MemoryBlob::Ptr minput = as(inputBlob); @@ -50,7 +51,7 @@ void fillBlobImage(Blob::Ptr& inputBlob, const std::vector& filePat // locked memory holder should be alive all time while access to its buffer // happens auto minputHolder = minput->wmap(); - auto inputBlobData = minputHolder.as(); + auto inputBlobData = minputHolder.as(); /** Collect images data ptrs **/ std::vector> vreader; @@ -90,7 +91,7 @@ void fillBlobImage(Blob::Ptr& inputBlob, const std::vector& filePat size_t offset = imageId * numChannels * width * height + (((app_info.layout == "NCHW") || (app_info.layout == "CHW")) ? 
(ch * width * height + h * width + w) : (h * width * numChannels + w * numChannels + ch)); - inputBlobData[offset] = vreader.at(imageId).get()[h * width * numChannels + w * numChannels + ch]; + inputBlobData[offset] = static_cast(vreader.at(imageId).get()[h * width * numChannels + w * numChannels + ch]); } } } @@ -142,7 +143,7 @@ using uniformDistribution = typename std::conditional::value, std::uniform_int_distribution, void>::type>::type; template -void fillBlobRandom(Blob::Ptr& inputBlob, T rand_min = std::numeric_limits::min(), T rand_max = std::numeric_limits::max()) { +void fillBlobRandom(Blob::Ptr& inputBlob, T rand_min = std::numeric_limits::min(), T rand_max = std::numeric_limits::max()) { MemoryBlob::Ptr minput = as(inputBlob); if (!minput) { IE_THROW() << "We expect inputBlob to be inherited from MemoryBlob in " @@ -270,7 +271,19 @@ void fillBlobs(const std::vector& inputFiles, const size_t& batchSi if (app_info.isImage()) { if (!imageFiles.empty()) { // Fill with Images - fillBlobImage(inputBlob, imageFiles, batchSize, app_info, requestId, imageInputId++, imageInputCount); + if (precision == InferenceEngine::Precision::FP32) { + fillBlobImage(inputBlob, imageFiles, batchSize, app_info, requestId, imageInputId++, imageInputCount); + } else if (precision == InferenceEngine::Precision::FP16) { + fillBlobImage(inputBlob, imageFiles, batchSize, app_info, requestId, imageInputId++, imageInputCount); + } else if (precision == InferenceEngine::Precision::I32) { + fillBlobImage(inputBlob, imageFiles, batchSize, app_info, requestId, imageInputId++, imageInputCount); + } else if (precision == InferenceEngine::Precision::I64) { + fillBlobImage(inputBlob, imageFiles, batchSize, app_info, requestId, imageInputId++, imageInputCount); + } else if (precision == InferenceEngine::Precision::U8) { + fillBlobImage(inputBlob, imageFiles, batchSize, app_info, requestId, imageInputId++, imageInputCount); + } else { + IE_THROW() << "Input precision is not supported for " << item.first; + } continue; } } else { diff --git a/inference-engine/samples/benchmark_app/main.cpp b/inference-engine/samples/benchmark_app/main.cpp index 849dc05ad3344a..2a5252ba443a85 100644 --- a/inference-engine/samples/benchmark_app/main.cpp +++ b/inference-engine/samples/benchmark_app/main.cpp @@ -4,8 +4,8 @@ #include #include -#include #include +#include #include #include #include @@ -277,12 +277,12 @@ int main(int argc, char* argv[]) { setThroughputStreams(); if ((device_name.find("MULTI") != std::string::npos) && (device_name.find("CPU") != std::string::npos)) { - slog::warn << "Turn on GPU trottling. Multi-device execution with " - "the CPU + GPU performs best with GPU trottling hint," + slog::warn << "Turn on GPU throttling. 
Multi-device execution with " "the CPU + GPU performs best with GPU throttling hint, " << "which releases another CPU thread (that is otherwise " "used by the GPU driver for active polling)" << slog::endl; - device_config[CLDNN_CONFIG_KEY(PLUGIN_THROTTLE)] = "1"; + device_config[GPU_CONFIG_KEY(PLUGIN_THROTTLE)] = "1"; } } else if (device == "MYRIAD") { device_config[CONFIG_KEY(LOG_LEVEL)] = CONFIG_VALUE(LOG_WARNING); diff --git a/inference-engine/samples/hello_query_device/README.md b/inference-engine/samples/hello_query_device/README.md index a185147f8ec50e..059077c48ad6b4 100644 --- a/inference-engine/samples/hello_query_device/README.md +++ b/inference-engine/samples/hello_query_device/README.md @@ -63,20 +63,20 @@ Available devices: SUPPORTED_METRICS : [ AVAILABLE_DEVICES SUPPORTED_METRICS FULL_DEVICE_NAME OPTIMIZATION_CAPABILITIES SUPPORTED_CONFIG_KEYS RANGE_FOR_ASYNC_INFER_REQUESTS RANGE_FOR_STREAMS ] FULL_DEVICE_NAME : Intel(R) UHD Graphics 620 (iGPU) OPTIMIZATION_CAPABILITIES : [ FP32 BIN FP16 ] - SUPPORTED_CONFIG_KEYS : [ CACHE_DIR CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS CLDNN_GRAPH_DUMPS_DIR CLDNN_MAX_NUM_THREADS CLDNN_MEM_POOL CLDNN_NV12_TWO_INPUTS CLDNN_PLUGIN_PRIORITY CLDNN_PLUGIN_THROTTLE CLDNN_SOURCES_DUMPS_DIR CLDNN_ENABLE_LOOP_UNROLLING CONFIG_FILE DEVICE_ID DUMP_KERNELS DYN_BATCH_ENABLED EXCLUSIVE_ASYNC_REQUESTS GPU_THROUGHPUT_STREAMS PERF_COUNT TUNING_FILE TUNING_MODE ] + SUPPORTED_CONFIG_KEYS : [ CACHE_DIR CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS CLDNN_GRAPH_DUMPS_DIR GPU_MAX_NUM_THREADS CLDNN_MEM_POOL CLDNN_NV12_TWO_INPUTS CLDNN_PLUGIN_PRIORITY CLDNN_PLUGIN_THROTTLE CLDNN_SOURCES_DUMPS_DIR GPU_ENABLE_LOOP_UNROLLING CONFIG_FILE DEVICE_ID DUMP_KERNELS DYN_BATCH_ENABLED EXCLUSIVE_ASYNC_REQUESTS GPU_THROUGHPUT_STREAMS PERF_COUNT TUNING_FILE TUNING_MODE ] RANGE_FOR_ASYNC_INFER_REQUESTS : { 1, 2, 1 } RANGE_FOR_STREAMS : { 1, 2 } Default values for device configuration keys: CACHE_DIR : "" CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS : YES CLDNN_GRAPH_DUMPS_DIR : "" - CLDNN_MAX_NUM_THREADS : 8 CLDNN_MEM_POOL : YES CLDNN_NV12_TWO_INPUTS : NO CLDNN_PLUGIN_PRIORITY : 0 CLDNN_PLUGIN_THROTTLE : 0 CLDNN_SOURCES_DUMPS_DIR : "" - CLDNN_ENABLE_LOOP_UNROLLING : YES + GPU_MAX_NUM_THREADS : 8 + GPU_ENABLE_LOOP_UNROLLING : YES CONFIG_FILE : "" DEVICE_ID : "" DUMP_KERNELS : NO diff --git a/inference-engine/samples/ngraph_function_creation_sample/README.md b/inference-engine/samples/ngraph_function_creation_sample/README.md index 1410241c3a595b..9f7b4f8d4337ee 100644 --- a/inference-engine/samples/ngraph_function_creation_sample/README.md +++ b/inference-engine/samples/ngraph_function_creation_sample/README.md @@ -1,6 +1,6 @@ # nGraph Function Creation C++ Sample {#openvino_inference_engine_samples_ngraph_function_creation_sample_README} -This sample demonstrates how to execute an synchronous inference using [nGraph function feature](../../../docs/nGraph_DG/build_function.md) to create a network, which uses weights from LeNet classification network. +This sample demonstrates how to execute a synchronous inference using the [nGraph function feature](../../../docs/nGraph_DG/build_function.md) to create a network that uses weights from the LeNet classification network, which is known to work well on digit classification tasks. The sample supports only single-channel `ubyte` images as an input.
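Related to the `hello_query_device` output above, a small hypothetical snippet (not part of the patch) showing how the GPU metrics now declared in `gpu/gpu_config.hpp` could be read through the regular `Core::GetMetric` call; it assumes a GPU device is available.

```cpp
#include <cstdint>
#include <iostream>
#include <string>
#include <ie_core.hpp>
#include <gpu/gpu_config.hpp>

int main() {
    InferenceEngine::Core core;

    // GPU_METRIC_KEY(...) expands to the same METRIC_KEY mechanism used by other metrics,
    // so the values come back as InferenceEngine::Parameter and are unpacked with as<T>().
    auto mem_size = core.GetMetric("GPU", GPU_METRIC_KEY(DEVICE_TOTAL_MEM_SIZE)).as<uint64_t>();
    auto uarch    = core.GetMetric("GPU", GPU_METRIC_KEY(UARCH_VERSION)).as<std::string>();
    auto eu_count = core.GetMetric("GPU", GPU_METRIC_KEY(EXECUTION_UNITS_COUNT)).as<int>();

    std::cout << "GPU memory size: " << mem_size << " bytes, "
              << "uarch: " << uarch << ", "
              << "execution units: " << eu_count << std::endl;
    return 0;
}
```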
diff --git a/inference-engine/samples/speech_sample/fileutils.cpp b/inference-engine/samples/speech_sample/fileutils.cpp index f3211a21a4b450..102cca25297444 100644 --- a/inference-engine/samples/speech_sample/fileutils.cpp +++ b/inference-engine/samples/speech_sample/fileutils.cpp @@ -108,15 +108,18 @@ void NumpyFile::GetFileInfo(const char* fileName, uint32_t numArrayToFindSize, u cnpy::npz_t my_npz1 = cnpy::npz_load(fileName); auto it = my_npz1.begin(); std::advance(it, numArrayToFindSize); - - numArrays = my_npz1.size(); - cnpy::NpyArray my_npy = it->second; - numMemoryBytes = my_npy.data_holder->size(); - - if (ptrNumArrays != NULL) - *ptrNumArrays = numArrays; - if (ptrNumMemoryBytes != NULL) - *ptrNumMemoryBytes = numMemoryBytes; + if (it != my_npz1.end()) { + numArrays = my_npz1.size(); + cnpy::NpyArray my_npy = it->second; + numMemoryBytes = my_npy.data_holder->size(); + + if (ptrNumArrays != NULL) + *ptrNumArrays = numArrays; + if (ptrNumMemoryBytes != NULL) + *ptrNumMemoryBytes = numMemoryBytes; + } else { + throw std::runtime_error(std::string("Failed to get info %s GetFileInfo()!\n") + fileName); + } } void NumpyFile::LoadFile(const char* fileName, uint32_t arrayIndex, std::string& ptrName, std::vector& memory, uint32_t* ptrNumRows, @@ -124,16 +127,20 @@ void NumpyFile::LoadFile(const char* fileName, uint32_t arrayIndex, std::string& cnpy::npz_t my_npz1 = cnpy::npz_load(fileName); auto it = my_npz1.begin(); std::advance(it, arrayIndex); - ptrName = it->first; - cnpy::NpyArray my_npy = it->second; - *ptrNumRows = my_npy.shape[0]; - *ptrNumColumns = my_npy.shape[1]; + if (it != my_npz1.end()) { + ptrName = it->first; + cnpy::NpyArray my_npy = it->second; + *ptrNumRows = my_npy.shape[0]; + *ptrNumColumns = my_npy.shape[1]; + + for (size_t i = 0; i < my_npy.data_holder->size(); i++) { + memory.at(i) = my_npy.data_holder->at(i); + } - for (size_t i = 0; i < my_npy.data_holder->size(); i++) { - memory.at(i) = my_npy.data_holder->at(i); + *ptrNumBytesPerElement = sizeof(float); + } else { + throw std::runtime_error(std::string("Failed to open %s for reading in LoadFile()!\n") + fileName); } - - *ptrNumBytesPerElement = sizeof(float); } void NumpyFile::SaveFile(const char* fileName, bool shouldAppend, std::string name, void* ptrMemory, uint32_t numRows, uint32_t numColumns) { diff --git a/inference-engine/samples/speech_sample/main.cpp b/inference-engine/samples/speech_sample/main.cpp index 2b9131774ad557..57db61a8e9e8cd 100644 --- a/inference-engine/samples/speech_sample/main.cpp +++ b/inference-engine/samples/speech_sample/main.cpp @@ -236,7 +236,8 @@ float getGnaFrequencyMHz() { const uint8_t cannon_lake_model = 102; const uint8_t gemini_lake_model = 122; const uint8_t ice_lake_model = 126; - const uint8_t next_model = 140; + const uint8_t tgl_model = 140; + const uint8_t next_model = 151; native_cpuid(&eax, &ebx, &ecx, &edx); family = (eax >> 8) & 0xF; @@ -254,6 +255,7 @@ float getGnaFrequencyMHz() { switch (model) { case cannon_lake_model: case ice_lake_model: + case tgl_model: case next_model: return 400; case gemini_lake_model: @@ -287,13 +289,14 @@ void printReferenceCompareResults(score_error_t const& totalError, size_t frames /** * @brief Print a report on the performance counts * @param utterancePerfMap reference to a map to store performance counters - * @param callsNum frame index + * @param numberOfFrames number of frames * @param stream output stream * @param fullDeviceName full device name string + * @param numberOfFramesOnHw number of frames delivered to GNA HW * 
@return none. */ -void printPerformanceCounters(std::map const& utterancePerfMap, size_t callsNum, std::ostream& stream, - std::string fullDeviceName) { +void printPerformanceCounters(std::map const& utterancePerfMap, size_t numberOfFrames, + std::ostream& stream, std::string fullDeviceName, const uint64_t numberOfFramesOnHw) { #if !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) && !defined(_M_ARM64) stream << std::endl << "Performance counts:" << std::endl; stream << std::setw(10) << std::right << "" @@ -305,29 +308,29 @@ void printPerformanceCounters(std::map(it.second.realTime_uSec); - float call_units = current_units / callsNum; - // if GNA HW counters - // get frequency of GNA module - float freq = getGnaFrequencyMHz(); - current_units /= freq * 1000; - call_units /= freq; + float current_units_us = static_cast(it.second.realTime_uSec) / freq; + float call_units_us = current_units_us / numberOfFrames; if (FLAGS_d.find("GNA") != std::string::npos) { stream << std::setw(30) << std::left << counter_name.substr(4, counter_name.size() - 1); } else { stream << std::setw(30) << std::left << counter_name; } - stream << std::setw(16) << std::right << current_units; - stream << std::setw(21) << std::right << call_units; + stream << std::setw(16) << std::right << current_units_us / 1000; + stream << std::setw(21) << std::right << call_units_us; stream << std::endl; } stream << std::endl; std::cout << std::endl; std::cout << "Full device name: " << fullDeviceName << std::endl; std::cout << std::endl; + stream << "Number of frames delivered to GNA HW: " << numberOfFramesOnHw; + stream << "/" << numberOfFrames; + stream << std::endl; #endif } @@ -346,16 +349,20 @@ void getPerformanceCounters(InferenceEngine::InferRequest& request, std::map const& perfCounters, - std::map& totalPerfCounters) { + std::map& totalPerfCounters, uint64_t& totalRunsOnHw) { + auto runOnHw = false; for (const auto& pair : perfCounters) { totalPerfCounters[pair.first].realTime_uSec += pair.second.realTime_uSec; + runOnHw |= pair.second.realTime_uSec > 0; // if realTime is above zero, that means that a primitive was executed on the device } + totalRunsOnHw += runOnHw; } /** @@ -443,6 +450,7 @@ bool ParseAndCheckCommandLine(int argc, char* argv[]) { "GPU", "GNA_AUTO", "GNA_HW", + "GNA_HW_WITH_SW_FBACK", "GNA_SW_EXACT", "GNA_SW", "GNA_SW_FP32", @@ -829,6 +837,7 @@ int main(int argc, char* argv[]) { /** Work with each utterance **/ for (uint32_t utteranceIndex = 0; utteranceIndex < numUtterances; ++utteranceIndex) { std::map utterancePerfMap; + uint64_t totalNumberOfRunsOnHw = 0; std::string uttName; uint32_t numFrames(0), n(0); std::vector numFrameElementsInput; @@ -984,7 +993,7 @@ int main(int argc, char* argv[]) { // retrieve new counters getPerformanceCounters(inferRequest.inferRequest, callPerfMap); // summarize retrieved counters with all previous - sumPerformanceCounters(callPerfMap, utterancePerfMap); + sumPerformanceCounters(callPerfMap, utterancePerfMap, totalNumberOfRunsOnHw); } } // ----------------------------------------------------------------------------------------------------- @@ -1092,7 +1101,7 @@ int main(int argc, char* argv[]) { std::cout << "Average Infer time per frame:\t\t" << totalTime / static_cast(numFrames) << " ms" << std::endl; if (FLAGS_pc) { // print performance results - printPerformanceCounters(utterancePerfMap, frameIndex, std::cout, getFullDeviceName(ie, FLAGS_d)); + printPerformanceCounters(utterancePerfMap, frameIndex, std::cout, getFullDeviceName(ie, FLAGS_d), 
totalNumberOfRunsOnHw); } if (!FLAGS_r.empty()) { // print statistical score error diff --git a/inference-engine/samples/speech_sample/speech_sample.hpp b/inference-engine/samples/speech_sample/speech_sample.hpp index cafe4db5c61758..66d3b24a4c5c47 100644 --- a/inference-engine/samples/speech_sample/speech_sample.hpp +++ b/inference-engine/samples/speech_sample/speech_sample.hpp @@ -21,10 +21,9 @@ static const char model_message[] = "Required. Path to an .xml file with a train /// @brief message for assigning cnn calculation to device static const char target_device_message[] = "Optional. Specify a target device to infer on. CPU, GPU, MYRIAD, GNA_AUTO, GNA_HW, " - "GNA_SW_FP32, " + "GNA_HW_WITH_SW_FBACK, GNA_SW_FP32, " "GNA_SW_EXACT and HETERO with combination of GNA as the primary device and CPU" - " as a secondary (e.g. HETERO:GNA,CPU) are supported. The list of available devices is shown " - "below. " + " as a secondary (e.g. HETERO:GNA,CPU) are supported. " "The sample will look for a suitable plugin for device specified."; /// @brief message for execution target diff --git a/inference-engine/samples/thirdparty/gflags b/inference-engine/samples/thirdparty/gflags deleted file mode 160000 index 46f73f88b18aee..00000000000000 --- a/inference-engine/samples/thirdparty/gflags +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 46f73f88b18aee341538c0dfc22b1710a6abedef diff --git a/inference-engine/scripts/run_tests_myriad_multistick.sh b/inference-engine/scripts/run_tests_myriad_multistick.sh index 468817c6f75109..73befb9b366e2d 100755 --- a/inference-engine/scripts/run_tests_myriad_multistick.sh +++ b/inference-engine/scripts/run_tests_myriad_multistick.sh @@ -55,7 +55,7 @@ fi if [[ "${APPS_TO_RUN}" -ge 4 ]] ; then # For more then 4 multidevice testing - for (( VAR = 4; VAR <= ${APPS_TO_RUN}; ++VAR )); do + for (( VAR = 4; VAR <= APPS_TO_RUN; ++VAR )); do ./${APP_NAME} --gtest_filter=*VPURegTest*YOLO*myriad* & pids+=" $!" 
done diff --git a/inference-engine/src/auto_plugin/auto_exec_network.cpp b/inference-engine/src/auto_plugin/auto_exec_network.cpp index 353196a88d4a41..49b0963c04d35e 100644 --- a/inference-engine/src/auto_plugin/auto_exec_network.cpp +++ b/inference-engine/src/auto_plugin/auto_exec_network.cpp @@ -3,10 +3,8 @@ // #include -#include #include #include -#include #include "ie_metric_helpers.hpp" #include "auto_exec_network.hpp" @@ -15,8 +13,8 @@ namespace AutoPlugin { using namespace InferenceEngine; -AutoExecutableNetwork::AutoExecutableNetwork(const SoExecutableNetworkInternal& network) : - _network(network) { +AutoExecutableNetwork::AutoExecutableNetwork(const SoExecutableNetworkInternal& network, bool enablePerfCount) : + _network(network), _enablePerfCount(enablePerfCount) { } AutoExecutableNetwork::~AutoExecutableNetwork() = default; @@ -24,7 +22,7 @@ AutoExecutableNetwork::~AutoExecutableNetwork() = default; InferenceEngine::IInferRequestInternal::Ptr AutoExecutableNetwork::CreateInferRequestImpl(InputsDataMap networkInputs, OutputsDataMap networkOutputs) { SoIInferRequestInternal inferRequest = {_network, _network->CreateInferRequest()}; - return std::make_shared(_networkInputs, _networkOutputs, inferRequest); + return std::make_shared(_networkInputs, _networkOutputs, inferRequest, _enablePerfCount); } void AutoExecutableNetwork::Export(std::ostream& networkModel) { diff --git a/inference-engine/src/auto_plugin/auto_exec_network.hpp b/inference-engine/src/auto_plugin/auto_exec_network.hpp index a39478b19a753a..e29970711ebc15 100644 --- a/inference-engine/src/auto_plugin/auto_exec_network.hpp +++ b/inference-engine/src/auto_plugin/auto_exec_network.hpp @@ -19,16 +19,11 @@ namespace AutoPlugin { using DeviceName = std::string; -struct DeviceInformation { - DeviceName deviceName; - std::map config; -}; - class AutoExecutableNetwork : public InferenceEngine::IExecutableNetworkInternal { public: using Ptr = std::shared_ptr; - explicit AutoExecutableNetwork(const InferenceEngine::SoExecutableNetworkInternal& network); + explicit AutoExecutableNetwork(const InferenceEngine::SoExecutableNetworkInternal& network, bool enablePerfCount); void Export(std::ostream& networkModel) override; InferenceEngine::RemoteContext::Ptr GetContext() const override; @@ -43,6 +38,7 @@ class AutoExecutableNetwork : public InferenceEngine::IExecutableNetworkInternal private: InferenceEngine::SoExecutableNetworkInternal _network; + bool _enablePerfCount; }; } // namespace AutoPlugin diff --git a/inference-engine/src/auto_plugin/auto_infer_request.cpp b/inference-engine/src/auto_plugin/auto_infer_request.cpp index f0777409830d68..46d603187152f7 100644 --- a/inference-engine/src/auto_plugin/auto_infer_request.cpp +++ b/inference-engine/src/auto_plugin/auto_infer_request.cpp @@ -11,13 +11,23 @@ namespace AutoPlugin { AutoInferRequest::AutoInferRequest(const InputsDataMap& networkInputs, const OutputsDataMap& networkOutputs, - const SoIInferRequestInternal& inferRequest) + const SoIInferRequestInternal& inferRequest, + bool enablePerfCount) : IInferRequestInternal(networkInputs, networkOutputs) - , _inferRequest(inferRequest) { + , _inferRequest(inferRequest) + , _enablePerfCount(enablePerfCount) { } std::map AutoInferRequest::GetPerformanceCounts() const { - return _inferRequest->GetPerformanceCounts(); + if (_enablePerfCount) { + try { + return _inferRequest->GetPerformanceCounts(); + } catch (...) 
{ + return {}; + } + } else { + return {}; + } } void AutoInferRequest::InferImpl() { diff --git a/inference-engine/src/auto_plugin/auto_infer_request.hpp b/inference-engine/src/auto_plugin/auto_infer_request.hpp index 1ccaf0093b27b3..c97b2fa5aedd49 100644 --- a/inference-engine/src/auto_plugin/auto_infer_request.hpp +++ b/inference-engine/src/auto_plugin/auto_infer_request.hpp @@ -24,7 +24,8 @@ class AutoInferRequest : public InferenceEngine::IInferRequestInternal { using Ptr = std::shared_ptr; explicit AutoInferRequest(const InferenceEngine::InputsDataMap& networkInputs, const InferenceEngine::OutputsDataMap& networkOutputs, - const InferenceEngine::SoIInferRequestInternal& inferRequest); + const InferenceEngine::SoIInferRequestInternal& inferRequest, + bool enablePerfCount); std::map GetPerformanceCounts() const override; void InferImpl() override; void SetBlob(const std::string& name, const InferenceEngine::Blob::Ptr& data) override; @@ -37,6 +38,7 @@ class AutoInferRequest : public InferenceEngine::IInferRequestInternal { private: InferenceEngine::SoIInferRequestInternal _inferRequest; + bool _enablePerfCount; }; } // namespace AutoPlugin diff --git a/inference-engine/src/auto_plugin/auto_plugin.cpp b/inference-engine/src/auto_plugin/auto_plugin.cpp index 1fc20063575c5c..274fa9d224f23f 100644 --- a/inference-engine/src/auto_plugin/auto_plugin.cpp +++ b/inference-engine/src/auto_plugin/auto_plugin.cpp @@ -75,11 +75,11 @@ IE::QueryNetworkResult AutoInferencePlugin::QueryNetwork(const IE::CNNNetwork& n } auto fullConfig = mergeConfigs(_config, config); - auto metaDevices = GetDeviceChoice(fullConfig); + auto metaDevices = GetDeviceList(fullConfig); std::unordered_set supportedLayers; for (auto&& value : metaDevices) { try { - auto deviceQr = GetCore()->QueryNetwork(network, value.deviceName, value.config); + auto deviceQr = GetCore()->QueryNetwork(network, value, {}); std::unordered_set deviceSupportedLayers; for (auto &&layerQr : deviceQr.supportedLayersMap) { deviceSupportedLayers.emplace(layerQr.first); @@ -111,7 +111,19 @@ IE::Parameter AutoInferencePlugin::GetConfig(const std::string& name, void AutoInferencePlugin::SetConfig(const ConfigType& config) { for (auto && kvp : config) { - _config[kvp.first] = kvp.second; + if (kvp.first.find("AUTO_") == 0) { + _config[kvp.first] = kvp.second; + } else if (kvp.first == IE::PluginConfigParams::KEY_PERF_COUNT) { + if (kvp.second == IE::PluginConfigParams::YES || + kvp.second == IE::PluginConfigParams::NO) { + _config[kvp.first] = kvp.second; + } else { + IE_THROW() << "Unsupported config value: " << kvp.second + << " for key: " << kvp.first; + } + } else { + IE_THROW() << "Unsupported config key: " << kvp.first; + } } } @@ -128,7 +140,10 @@ IE::Parameter AutoInferencePlugin::GetMetric(const std::string& name, std::string device_name = {"Inference Engine AUTO device"}; IE_SET_METRIC_RETURN(FULL_DEVICE_NAME, device_name); } else if (name == METRIC_KEY(SUPPORTED_CONFIG_KEYS)) { - std::vector configKeys; + std::vector configKeys = { + IE::KEY_AUTO_DEVICE_LIST, + IE::PluginConfigParams::KEY_PERF_COUNT + }; IE_SET_METRIC_RETURN(SUPPORTED_CONFIG_KEYS, configKeys); } else if (name == METRIC_KEY(OPTIMIZATION_CAPABILITIES)) { std::vector capabilities = GetOptimizationCapabilities(options); @@ -139,42 +154,21 @@ IE::Parameter AutoInferencePlugin::GetMetric(const std::string& name, } //////////////////////////////////// private & protected functions /////////////////// -std::vector AutoInferencePlugin::GetDeviceChoice(const ConfigType& config) const { 
- std::vector metaDevices; - std::vector availableDevices; +std::vector AutoInferencePlugin::GetDeviceList(const ConfigType& config) const { + std::vector deviceList; auto deviceListConfig = config.find(IE::KEY_AUTO_DEVICE_LIST); if (deviceListConfig == config.end()) { - availableDevices = GetCore()->GetAvailableDevices(); + deviceList = GetCore()->GetAvailableDevices(); } else { - availableDevices = IE::DeviceIDParser::getHeteroDevices(deviceListConfig->second); + deviceList = IE::DeviceIDParser::getHeteroDevices(deviceListConfig->second); } - auto getDeviceConfig = [&] (const DeviceName & deviceWithID) { - IE::DeviceIDParser deviceParser(deviceWithID); - std::string deviceName = deviceParser.getDeviceName(); - ConfigType tconfig = config; - - // set device ID if any - std::string deviceIDLocal = deviceParser.getDeviceID(); - if (!deviceIDLocal.empty()) { - tconfig[IE::PluginConfigParams::KEY_DEVICE_ID] = deviceIDLocal; - } - - return GetSupportedConfig(tconfig, deviceName); - }; - - for (auto && d : availableDevices) { - if (d != _pluginName) { - metaDevices.push_back({ d, getDeviceConfig(d)}); - } - } - - if (metaDevices.empty()) { + if (deviceList.empty()) { IE_THROW() << "Please, check environment due to no supported devices can be used"; } - return metaDevices; + return deviceList; } std::vector AutoInferencePlugin::GetOptimizationCapabilities(const std::map & options) const { @@ -215,7 +209,21 @@ ConfigType AutoInferencePlugin::GetSupportedConfig(const ConfigType& config, return supportedConfig; } -DeviceInformation AutoInferencePlugin::SelectDevice(const std::vector& metaDevices, const std::string& networkPrecision) { +void AutoInferencePlugin::CheckConfig(const ConfigType& config) { + std::vector supportedConfigKeys = GetMetric(METRIC_KEY(SUPPORTED_CONFIG_KEYS), {}); + for (auto&& c : config) { + auto itKey = std::find(supportedConfigKeys.begin(), supportedConfigKeys.end(), c.first); + if (supportedConfigKeys.end() == itKey) { + // CVS-57233 + if (c.first.find("AUTO_") == 0) { + continue; + } + IE_THROW() << "AUTO plugin doesn't support config key " << c.first; + } + } +} + +DeviceName AutoInferencePlugin::SelectDevice(const std::vector& metaDevices, const std::string& networkPrecision) { if (metaDevices.empty()) { IE_THROW(NotFound) << "No available device to select in AUTO plugin"; } @@ -223,15 +231,15 @@ DeviceInformation AutoInferencePlugin::SelectDevice(const std::vector CPU; - std::vector GPU; + std::vector CPU; + std::vector GPU; for (auto& item : metaDevices) { - if (item.deviceName.find("CPU") == 0) { + if (item.find("CPU") == 0) { CPU.push_back(item); continue; } - if (item.deviceName.find("GPU") == 0) { + if (item.find("GPU") == 0) { GPU.push_back(item); continue; } @@ -242,10 +250,10 @@ DeviceInformation AutoInferencePlugin::SelectDevice(const std::vector GPU.1 > GPU.0 > GPU, so we always choose the GPU[0] as best device - std::sort(GPU.begin(), GPU.end(), [](const DeviceInformation& a, const DeviceInformation& b)->bool{return b.deviceName < a.deviceName;}); + std::sort(GPU.begin(), GPU.end(), [](const DeviceName& a, const DeviceName& b)->bool{return b < a;}); for (auto&& item : GPU) { - std::vector capability = GetCore()->GetMetric(item.deviceName, METRIC_KEY(OPTIMIZATION_CAPABILITIES)); + std::vector capability = GetCore()->GetMetric(item, METRIC_KEY(OPTIMIZATION_CAPABILITIES)); auto res = std::find(capability.begin(), capability.end(), networkPrecision); if (res != capability.end()) { return item; diff --git a/inference-engine/src/auto_plugin/auto_plugin.hpp 
b/inference-engine/src/auto_plugin/auto_plugin.hpp index af42e9f0ef7c28..858ee2143fd06f 100644 --- a/inference-engine/src/auto_plugin/auto_plugin.hpp +++ b/inference-engine/src/auto_plugin/auto_plugin.hpp @@ -30,10 +30,11 @@ class AutoInferencePlugin : public IE::IInferencePlugin { void SetConfig(const ConfigType& config) override; private: - std::vector GetDeviceChoice(const ConfigType& config) const; + std::vector GetDeviceList(const ConfigType& config) const; std::vector GetOptimizationCapabilities(const std::map& options) const; - DeviceInformation SelectDevice(const std::vector& metaDevices, const std::string& networkPrecision = METRIC_VALUE(FP32)); - ConfigType GetSupportedConfig(const ConfigType& config, const AutoPlugin::DeviceName & deviceName) const; + DeviceName SelectDevice(const std::vector& metaDevices, const std::string& networkPrecision = METRIC_VALUE(FP32)); + ConfigType GetSupportedConfig(const ConfigType& config, const DeviceName & deviceName) const; + void CheckConfig(const ConfigType& config); static ConfigType mergeConfigs(ConfigType config, const ConfigType& local); template @@ -41,18 +42,21 @@ class AutoInferencePlugin : public IE::IInferencePlugin { if (GetCore() == nullptr) { IE_THROW() << "Please, work with AUTO device via InferencEngine::Core object"; } + + CheckConfig(config); + auto fullConfig = mergeConfigs(_config, config); - auto metaDevices = GetDeviceChoice(fullConfig); - DeviceInformation selectedDevice; + auto metaDevices = GetDeviceList(fullConfig); + DeviceName selectedDevice; IE::SoExecutableNetworkInternal executableNetwork; while (!metaDevices.empty()) { selectedDevice = SelectDevice(metaDevices, networkPrecision); try { - executableNetwork = GetCore()->LoadNetwork(param, selectedDevice.deviceName, selectedDevice.config); + executableNetwork = GetCore()->LoadNetwork(param, selectedDevice, {}); break; } catch (...) 
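
The LoadNetworkImpl template above validates the config, resolves the plain device-name list, and then retries SelectDevice/LoadNetwork until a device succeeds or the list is exhausted. From the application side this is driven by the device-list key; a hedged sketch ("model.xml" is a placeholder path, and the constant spelling follows the IE::KEY_AUTO_DEVICE_LIST reference in this patch):

    #include <ie_core.hpp>
    #include <ie_plugin_config.hpp>

    int main() {
        InferenceEngine::Core core;
        auto network = core.ReadNetwork("model.xml");
        // AUTO tries GPU first when it reports the network precision in its
        // OPTIMIZATION_CAPABILITIES metric, otherwise falls back to CPU.
        auto exec = core.LoadNetwork(network, "AUTO",
            {{InferenceEngine::KEY_AUTO_DEVICE_LIST, "GPU,CPU"}});
        auto request = exec.CreateInferRequest();
        return 0;
    }
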
{ auto eraseDevice = std::find_if(metaDevices.begin(), metaDevices.end(), - [=](const DeviceInformation& d)->bool{return d.deviceName == selectedDevice.deviceName;}); + [=](const DeviceName& d)->bool{return d == selectedDevice;}); if (eraseDevice == metaDevices.end()) { IE_THROW() << "Didn't find the selected device name"; } @@ -63,7 +67,10 @@ class AutoInferencePlugin : public IE::IInferencePlugin { if (!executableNetwork) { IE_THROW() << "Failed to load network by AUTO plugin"; } - auto impl = std::make_shared(executableNetwork); + + bool enablePerfCount = fullConfig.find(IE::PluginConfigParams::KEY_PERF_COUNT) != fullConfig.end(); + + auto impl = std::make_shared(executableNetwork, enablePerfCount); if (std::is_same::value) { SetExeNetworkInfo(impl, executableNetwork->GetInputsInfo(), diff --git a/inference-engine/src/cldnn_engine/CMakeLists.txt b/inference-engine/src/cldnn_engine/CMakeLists.txt index ff138843dc45a6..161d6f16a8d2c8 100644 --- a/inference-engine/src/cldnn_engine/CMakeLists.txt +++ b/inference-engine/src/cldnn_engine/CMakeLists.txt @@ -33,7 +33,7 @@ target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} $ ${CLDNN__OCL_ICD_INCDIRS} - ${CLDNN_TOP_FOLDER}) + ${CLDNN_TOP_FOLDER}/api) set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO}) diff --git a/inference-engine/src/cldnn_engine/cldnn_common_utils.h b/inference-engine/src/cldnn_engine/cldnn_common_utils.h index c374a71a4653f3..f41f2d8e134b1c 100644 --- a/inference-engine/src/cldnn_engine/cldnn_common_utils.h +++ b/inference-engine/src/cldnn_engine/cldnn_common_utils.h @@ -5,7 +5,7 @@ #pragma once #include -#include +#include #include "ngraph/type/element_type.hpp" diff --git a/inference-engine/src/cldnn_engine/cldnn_config.cpp b/inference-engine/src/cldnn_engine/cldnn_config.cpp index ff5d96935224cb..3de19bdff87dcc 100644 --- a/inference-engine/src/cldnn_engine/cldnn_config.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_config.cpp @@ -5,6 +5,7 @@ #include #include +#include #include "cldnn_config.h" #include "cpp_interfaces/interface/ie_internal_plugin_config.hpp" #include "ie_api.h" @@ -39,6 +40,7 @@ static void createDirectory(std::string _path) { } } +IE_SUPPRESS_DEPRECATED_START void Config::UpdateFromMap(const std::map& configMap) { OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "Config::UpdateFromMap"); for (auto& kvp : configMap) { @@ -69,7 +71,8 @@ void Config::UpdateFromMap(const std::map& configMap) } else { IE_THROW(NotFound) << "Unsupported property value by plugin: " << val; } - } else if (key.compare(CLDNNConfigParams::KEY_CLDNN_PLUGIN_PRIORITY) == 0) { + } else if (key.compare(GPUConfigParams::KEY_GPU_PLUGIN_PRIORITY) == 0 || + key.compare(CLDNNConfigParams::KEY_CLDNN_PLUGIN_PRIORITY) == 0) { std::stringstream ss(val); uint32_t uVal(0); ss >> uVal; @@ -93,7 +96,8 @@ void Config::UpdateFromMap(const std::map& configMap) IE_THROW(ParameterMismatch) << "Unsupported queue priority value: " << uVal; } - } else if (key.compare(CLDNNConfigParams::KEY_CLDNN_PLUGIN_THROTTLE) == 0) { + } else if (key.compare(GPUConfigParams::KEY_GPU_PLUGIN_THROTTLE) == 0 || + key.compare(CLDNNConfigParams::KEY_CLDNN_PLUGIN_THROTTLE) == 0) { std::stringstream ss(val); uint32_t uVal(0); ss >> uVal; @@ -205,7 +209,8 @@ void Config::UpdateFromMap(const std::map& configMap) } else { IE_THROW(NotFound) << "Unsupported property value by plugin: " << val; } - } else if (key.compare(CLDNNConfigParams::KEY_CLDNN_NV12_TWO_INPUTS) == 0) { + } else if 
(key.compare(GPUConfigParams::KEY_GPU_NV12_TWO_INPUTS) == 0 || + key.compare(CLDNNConfigParams::KEY_CLDNN_NV12_TWO_INPUTS) == 0) { if (val.compare(PluginConfigParams::YES) == 0) { nv12_two_inputs = true; } else if (val.compare(PluginConfigParams::NO) == 0) { @@ -221,7 +226,7 @@ void Config::UpdateFromMap(const std::map& configMap) } else { IE_THROW(NotFound) << "Unsupported KEY_CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS flag value: " << val; } - } else if (key.compare(CLDNNConfigParams::KEY_CLDNN_MAX_NUM_THREADS) == 0) { + } else if (key.compare(GPUConfigParams::KEY_GPU_MAX_NUM_THREADS) == 0) { int max_threads = std::max(1, static_cast(std::thread::hardware_concurrency())); try { int val_i = std::stoi(val); @@ -231,17 +236,17 @@ void Config::UpdateFromMap(const std::map& configMap) n_threads = val_i; } } catch (const std::exception&) { - IE_THROW() << "Wrong value for property key " << CLDNNConfigParams::KEY_CLDNN_MAX_NUM_THREADS << ": " << val + IE_THROW() << "Wrong value for property key " << GPUConfigParams::KEY_GPU_MAX_NUM_THREADS << ": " << val << "\nSpecify the number of threads use for build as an integer." << "\nOut of range value will be set as a default value, maximum concurrent threads."; } - } else if (key.compare(CLDNNConfigParams::KEY_CLDNN_ENABLE_LOOP_UNROLLING) == 0) { + } else if (key.compare(GPUConfigParams::KEY_GPU_ENABLE_LOOP_UNROLLING) == 0) { if (val.compare(PluginConfigParams::YES) == 0) { enable_loop_unrolling = true; } else if (val.compare(PluginConfigParams::NO) == 0) { enable_loop_unrolling = false; } else { - IE_THROW(ParameterMismatch) << "Unsupported KEY_CLDNN_ENABLE_LOOP_UNROLLING flag value: " << val; + IE_THROW(ParameterMismatch) << "Unsupported KEY_GPU_ENABLE_LOOP_UNROLLING flag value: " << val; } } else { IE_THROW(NotFound) << "Unsupported property key by plugin: " << key; @@ -297,6 +302,7 @@ void Config::adjustKeyMapValues() { default: break; } key_config_map[CLDNNConfigParams::KEY_CLDNN_PLUGIN_PRIORITY] = qp; + key_config_map[GPUConfigParams::KEY_GPU_PLUGIN_PRIORITY] = qp; } { std::string qt = "0"; @@ -307,6 +313,7 @@ void Config::adjustKeyMapValues() { default: break; } key_config_map[CLDNNConfigParams::KEY_CLDNN_PLUGIN_THROTTLE] = qt; + key_config_map[GPUConfigParams::KEY_GPU_PLUGIN_THROTTLE] = qt; } { std::string tm = PluginConfigParams::TUNING_DISABLED; @@ -328,11 +335,13 @@ void Config::adjustKeyMapValues() { key_config_map[PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS] = std::to_string(throughput_streams); key_config_map[PluginConfigParams::KEY_DEVICE_ID] = device_id; key_config_map[PluginConfigParams::KEY_CONFIG_FILE] = ""; - key_config_map[CLDNNConfigParams::KEY_CLDNN_MAX_NUM_THREADS] = std::to_string(n_threads); + key_config_map[GPUConfigParams::KEY_GPU_MAX_NUM_THREADS] = std::to_string(n_threads); if (enable_loop_unrolling) - key_config_map[CLDNNConfigParams::KEY_CLDNN_ENABLE_LOOP_UNROLLING] = PluginConfigParams::YES; + key_config_map[GPUConfigParams::KEY_GPU_ENABLE_LOOP_UNROLLING] = PluginConfigParams::YES; else - key_config_map[CLDNNConfigParams::KEY_CLDNN_ENABLE_LOOP_UNROLLING] = PluginConfigParams::NO; + key_config_map[GPUConfigParams::KEY_GPU_ENABLE_LOOP_UNROLLING] = PluginConfigParams::NO; } +IE_SUPPRESS_DEPRECATED_END + } // namespace CLDNNPlugin diff --git a/inference-engine/src/cldnn_engine/cldnn_config.h b/inference-engine/src/cldnn_engine/cldnn_config.h index 8c6d5d6c921da8..873c01e2188445 100644 --- a/inference-engine/src/cldnn_engine/cldnn_config.h +++ b/inference-engine/src/cldnn_engine/cldnn_config.h @@ -9,7 +9,7 @@ #include 
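
The Config::UpdateFromMap changes above keep the legacy CLDNN_* spellings for priority, throttle and NV12 while adding the new GPU_* aliases, and move loop unrolling and max-threads to the GPU_* names only; adjustKeyMapValues publishes both spellings for the shared settings. A user-facing sketch (header paths taken from this patch; the InferenceEngine namespaces around the key structs are assumptions):

    #include <ie_core.hpp>
    #include <ie_plugin_config.hpp>
    #include <gpu/gpu_config.hpp>      // new GPUConfigParams keys
    #include <cldnn/cldnn_config.hpp>  // legacy CLDNNConfigParams keys (now deprecated)

    int main() {
        InferenceEngine::Core core;
        // New spelling:
        core.SetConfig({{InferenceEngine::GPUConfigParams::KEY_GPU_ENABLE_LOOP_UNROLLING,
                         InferenceEngine::PluginConfigParams::YES}}, "GPU");
        // Legacy spelling, still routed to the same setting by this hunk:
        core.SetConfig({{InferenceEngine::CLDNNConfigParams::KEY_CLDNN_PLUGIN_THROTTLE, "1"}}, "GPU");
        return 0;
    }
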
"cldnn_custom_layer.h" -#include +#include namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/cldnn_custom_layer.h b/inference-engine/src/cldnn_engine/cldnn_custom_layer.h index 95a6ff4c5c9f82..cbe41f8831b3f8 100644 --- a/inference-engine/src/cldnn_engine/cldnn_custom_layer.h +++ b/inference-engine/src/cldnn_engine/cldnn_custom_layer.h @@ -10,7 +10,7 @@ #include #include #include "pugixml.hpp" -#include "api/tensor.hpp" +#include "cldnn/runtime/tensor.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/cldnn_engine.cpp b/inference-engine/src/cldnn_engine/cldnn_engine.cpp index 86b9f2e4b9526b..5b3b90eb832597 100644 --- a/inference-engine/src/cldnn_engine/cldnn_engine.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_engine.cpp @@ -79,7 +79,9 @@ #include "cldnn_executable_network.h" #include "cldnn_custom_layer.h" #include "cldnn_itt.h" -#include "cldnn/cldnn_config.hpp" +#include "gpu/gpu_config.hpp" + +#include "cldnn/runtime/device_query.hpp" #ifdef __linux__ # include @@ -117,13 +119,13 @@ struct clDNNEngine::impl { }; cldnn::device_info clDNNEngine::GetDeviceInfo(const std::map &config) const { - auto device_info = device_map.begin()->second.get_info(); + auto device_info = device_map.begin()->second->get_info(); if (config.find(PluginConfigParams::KEY_DEVICE_ID) != config.end()) { auto val = config.at(PluginConfigParams::KEY_DEVICE_ID); if (device_map.find(val) == device_map.end()) { IE_THROW() << "Invalid device ID: " << val; } - device_info = device_map.at(val).get_info(); + device_info = device_map.at(val)->get_info(); } return device_info; @@ -132,11 +134,8 @@ cldnn::device_info clDNNEngine::GetDeviceInfo(const std::map static bool disableReduceDecomposition(const std::shared_ptr node) { if (auto op = std::dynamic_pointer_cast(node)) { - auto reduction_axes = op->get_reduction_axes().to_vector(); - bool reduce_along_f = op->get_reduction_axes().size() == 1 && std::count(reduction_axes.begin(), reduction_axes.end(), 1) != 0; bool fp16_batch_not_1 = op->get_element_type() == ngraph::element::f16 && op->input(0).get_shape()[0] != 1; - bool can_use_reduce = !reduce_along_f && !fp16_batch_not_1; - return can_use_reduce; + return !fp16_batch_not_1; } return false; } @@ -445,7 +444,8 @@ clDNNEngine::clDNNEngine() : m_defaultContext(nullptr) { RegisterPrimitives(); // try loading clDNN engine and get info from it { - cldnn::device_query device_query; + // Set OCL runtime which should be always available + cldnn::device_query device_query(cldnn::engine_types::ocl, cldnn::runtime_types::ocl); device_map = device_query.get_available_devices(); } // locate global custom kernel config @@ -851,8 +851,8 @@ auto StringRightTrim = [](std::string string, std::string substring, bool case_s }; static float GetGOPS(cldnn::device_info info, cldnn::data_types dt) { - auto freqGHz = info.core_frequency / 1000.f; - auto numEUs = info.cores_count; + auto freqGHz = info.gpu_frequency / 1000.f; + auto numEUs = info.execution_units_count; auto opsPerComputeBlock = 0; auto computeBlockIPC = 1.0f; switch (dt) { @@ -894,8 +894,8 @@ Parameter clDNNEngine::GetMetric(const std::string& name, const std::mapsecond.get_info() : - device_map.begin()->second.get_info(); + iter->second->get_info() : + device_map.begin()->second->get_info(); if (name == METRIC_KEY(SUPPORTED_METRICS)) { std::vector metrics; @@ -931,7 +931,7 @@ Parameter clDNNEngine::GetMetric(const std::string& name, const std::map #include #include -#include +#include #include #include #include 
"cldnn_remote_context.h" @@ -22,7 +22,7 @@ class clDNNEngine : public InferenceEngine::IInferencePlugin, std::shared_ptr _impl; // key: device_id, value: cldnn device - std::map device_map; + std::map device_map; std::mutex engine_mutex; mutable CLDNNRemoteCLContext::Ptr m_defaultContext; diff --git a/inference-engine/src/cldnn_engine/cldnn_executable_network.cpp b/inference-engine/src/cldnn_engine/cldnn_executable_network.cpp index c2289fa9fb0bde..94245c1d3b6bee 100644 --- a/inference-engine/src/cldnn_engine/cldnn_executable_network.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_executable_network.cpp @@ -2,13 +2,13 @@ // SPDX-License-Identifier: Apache-2.0 // -#include -#include -#include #include "ie_metric_helpers.hpp" -#include -#include +#include +#include +#include + +#include "ie_metric_helpers.hpp" #include #include #include @@ -16,7 +16,6 @@ #include "cldnn_itt.h" #include -#include #include "cldnn_infer_request.h" #include #include "cldnn_async_infer_request.h" @@ -28,7 +27,6 @@ #include "threading/ie_cpu_streams_executor.hpp" #include "cpp_interfaces/interface/ie_iinfer_request_internal.hpp" - using namespace InferenceEngine; using namespace InferenceEngine::details; diff --git a/inference-engine/src/cldnn_engine/cldnn_graph.cpp b/inference-engine/src/cldnn_engine/cldnn_graph.cpp index 04d40c9815d00d..2b333a38ee9d20 100644 --- a/inference-engine/src/cldnn_engine/cldnn_graph.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_graph.cpp @@ -2,23 +2,28 @@ // SPDX-License-Identifier: Apache-2.0 // +#include +#include + +#include "cldnn_graph.h" +#include "simple_math.h" +#include +#include "cldnn_infer_request.h" + +#include +#include +#include + +#include +#include + #include #include #include #include -#include -#include -#include -#include #include #include #include -#include "cldnn_graph.h" -#include "simple_math.h" -#include -#include -#include "cldnn_infer_request.h" -#include #include #include #include @@ -72,12 +77,10 @@ void CLDNNGraph::Build() { for (int b = m_bv_sz - 1; b >= 0; b--) { auto network = BuildNetwork(m_program->GetCompiledProgram(b)); m_networks.insert(m_networks.begin(), network); - GetEngine()->release_pending_memory(network->get_id()); } } else { auto network = BuildNetwork(m_program->GetCompiledProgram()); m_networks.emplace_back(network); - GetEngine()->release_pending_memory(network->get_id()); } UpdateImplementationsMap(); @@ -500,7 +503,7 @@ void CLDNNGraph::UpdatePerfStatistics() { } }; - std::map executedPrimitives = GetNetwork()->get_executed_primitives(); + std::map executedPrimitives = GetNetwork()->get_executed_primitives(); auto allPrimitives = GetNetwork()->get_all_primitives(); // Get profiling info for all layers @@ -522,7 +525,7 @@ void CLDNNGraph::UpdatePerfStatistics() { auto event = execIter->second; executedPrimitives.erase(execIter); - cldnn::instrumentation::profiling_info cldnnInfo{profiledID, event.get_profiling_info()}; + cldnn::instrumentation::profiling_info cldnnInfo{profiledID, event->get_profiling_info()}; collectTimings(cldnnInfo, perfCount); perfCount.num++; @@ -535,7 +538,7 @@ void CLDNNGraph::UpdatePerfStatistics() { pcIter = perfMap.find(executedID.first); auto& perfCount = pcIter->second.second; - cldnn::instrumentation::profiling_info cldnnInfo{executedID.first, executedID.second.get_profiling_info()}; + cldnn::instrumentation::profiling_info cldnnInfo{executedID.first, executedID.second->get_profiling_info()}; collectTimings(cldnnInfo, perfCount); perfCount.num++; @@ -676,7 +679,7 @@ std::map CLDNNGraph::G 
executedPrimitives.find(primId) != executedPrimitives.end()) { auto event = executedPrimitives.at(primId); - cldnn::instrumentation::profiling_info cldnnInfo{primId, event.get_profiling_info()}; + cldnn::instrumentation::profiling_info cldnnInfo{primId, event->get_profiling_info()}; // Collect timings long long cpuTime = 0; diff --git a/inference-engine/src/cldnn_engine/cldnn_graph.h b/inference-engine/src/cldnn_engine/cldnn_graph.h index 774b159a16c2b9..5ce64712fefa9b 100644 --- a/inference-engine/src/cldnn_engine/cldnn_graph.h +++ b/inference-engine/src/cldnn_engine/cldnn_graph.h @@ -17,8 +17,8 @@ #include "ie_blob.h" #include "cpp/ie_cnn_network.h" -#include -#include +#include +#include #include #include "cldnn_custom_layer.h" @@ -43,7 +43,7 @@ class CLDNNGraph { const Config& getConfig() const { return m_config; } InferenceEngine::gpu::ClContext::Ptr GetContext() { return m_context; } - std::shared_ptr GetEngine() const { return getContextImpl(m_context)->GetEngine(); } + std::shared_ptr GetEngine() const { return getContextImpl(m_context)->GetEngine(); } int GetMaxDynamicBatchSize() const { return getConfig().max_dynamic_batch; } const std::map& GetInputLayouts() const { return m_program->GetInputLayouts(); } size_t GetNetworksCount() const { return m_networks.size(); } diff --git a/inference-engine/src/cldnn_engine/cldnn_infer_request.cpp b/inference-engine/src/cldnn_engine/cldnn_infer_request.cpp index 23f9895970ddf1..bb923f373b9e14 100644 --- a/inference-engine/src/cldnn_engine/cldnn_infer_request.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_infer_request.cpp @@ -19,7 +19,7 @@ using namespace InferenceEngine; namespace CLDNNPlugin { -const char CLDNNInferRequest::fp32_suffix[] = "_fp32"; +const char fp32_suffix[] = "_fp32"; const char str_not_allocated[] = "Input data was not allocated."; const char cannot_set_compound[] = "cannot set compound blob: supported only for input pre-processing"; const char wrong_nv12_blob[] = "NV12 input blob is expected for input with NV12 color format"; @@ -110,7 +110,7 @@ Blob::Ptr CLDNNInferRequest::createOutputBlob(const TensorDesc& desc, uint8_t* m } } -void CLDNNInferRequest::input_attach(cldnn::primitive_id name, cldnn::memory& inputMem) { +void CLDNNInferRequest::input_attach(cldnn::primitive_id name, cldnn::memory::ptr inputMem) { OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::input_attach"); auto impl = getContextImpl(m_graph->GetContext()); impl->acquire_lock(); @@ -127,159 +127,66 @@ void CLDNNInferRequest::input_attach(cldnn::primitive_id name, cldnn::memory& in void CLDNNInferRequest::input_alloc(cldnn::primitive_id name, const cldnn::layout& layout) { OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::input_alloc"); - cldnn::memory input_mem = cldnn::memory::allocate(*(m_graph->GetEngine()), layout); + cldnn::memory::ptr input_mem = m_graph->GetEngine()->allocate_memory(layout); input_attach(name, input_mem); } -void CLDNNInferRequest::copyOutputData(const cldnn::memory& outputMemory, - Blob::Ptr bptr, - buf_info* bi) { - OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::copyOutputData"); - size_t n = (bi == nullptr) ? bptr->size() : bi->buf_size; +template +void copyResultToOutputBlob(cldnn::memory::ptr src, Blob::Ptr dst, buf_info* bi, cldnn::stream& stream) { + size_t n = (bi == nullptr) ? dst->size() : bi->buf_size; size_t offset = (bi == nullptr) ? 
0 : bi->buf_offset; - auto layout = outputMemory.get_layout(); + auto layout = src->get_layout(); auto size = layout.size; - auto l_padd = layout.data_padding.lower_size(); - auto u_padd = layout.data_padding.upper_size(); - - auto h_padding = u_padd.spatial[0] + l_padd.spatial[0]; - auto v_padding_l = (h_padding + size.spatial[0]) * u_padd.spatial[1]; - auto v_padding_u = (h_padding + size.spatial[0]) * l_padd.spatial[1]; - auto locked = bptr->buffer(); - switch (bptr->getTensorDesc().getPrecision()) { - case Precision::FP32: { - auto out_f = locked.as(); - if (out_f == nullptr) { - IE_THROW() << "Invalid output blob"; - } - auto resPtr = outputMemory.pointer(); - float *resVec = out_f + offset; - - if (h_padding || v_padding_l || v_padding_u) { - size_t i = 0; - for (size_t b = 0; b < size.batch[0]; b++) { - for (size_t f = 0; f < size.feature[0]; f++) { - i += v_padding_l; - for (size_t y = 0; y < size.spatial[1]; y++) { - i += l_padd.spatial[0]; - for (size_t x = 0; x < size.spatial[0]; x++, i++) { - *resVec++ = resPtr[i]; - } - i += u_padd.spatial[0]; - } - i += v_padding_u; - } - } - } else { - for (size_t i = 0; i < n; i++) { - resVec[i] = resPtr[i]; - } - } - } - break; - case Precision::FP16: { - auto out_f = locked.as(); - if (out_f == nullptr) { - IE_THROW() << "Invalid output blob"; - } - auto resPtr = outputMemory.pointer(); - uint16_t* resVec = out_f + offset; - - if (h_padding || v_padding_l || v_padding_u) { - size_t i = 0; - for (size_t b = 0; b < size.batch[0]; b++) { - for (size_t f = 0; f < size.feature[0]; f++) { - i += v_padding_l; - for (size_t y = 0; y < size.spatial[1]; y++) { - i += l_padd.spatial[0]; - for (size_t x = 0; x < size.spatial[0]; x++, i++) { - *resVec++ = resPtr[i]; - } - i += u_padd.spatial[0]; - } - i += v_padding_u; - } - } - } else { - for (size_t i = 0; i < n; i++) { - resVec[i] = resPtr[i]; - } - } - } - break; - case Precision::I32: { - auto out_f = locked.as(); - if (out_f == nullptr) { - IE_THROW() << "Invalid output blob"; - } - auto resPtr = outputMemory.pointer(); - int32_t* resVec = out_f + offset; - - if (h_padding || v_padding_l || v_padding_u) { - size_t i = 0; - for (size_t b = 0; b < size.batch[0]; b++) { - for (size_t f = 0; f < size.feature[0]; f++) { - i += v_padding_l; - for (size_t y = 0; y < size.spatial[1]; y++) { - i += l_padd.spatial[0]; - for (size_t x = 0; x < size.spatial[0]; x++, i++) { - *resVec++ = resPtr[i]; + auto locked_dst = dst->buffer(); + auto dst_ptr = locked_dst.as(); + if (dst_ptr == nullptr) { + IE_THROW() << "Invalid output blob"; + } + cldnn::mem_lock src_lock{ src, stream }; + T* src_ptr = src_lock.data(); + dst_ptr += offset; + + if (layout.data_padding) { + for (size_t b = 0; b < size.batch[0]; b++) { + for (size_t f = 0; f < size.feature[0]; f++) { + for (size_t w = 0; w < size.spatial[3]; w++) { + for (size_t z = 0; z < size.spatial[2]; z++) { + for (size_t y = 0; y < size.spatial[1]; y++) { + for (size_t x = 0; x < size.spatial[0]; x++) { + *dst_ptr++ = src_ptr[layout.get_linear_offset(cldnn::tensor(b, f, x, y, z, w))]; + } } - i += u_padd.spatial[0]; } - i += v_padding_u; } } - } else { - for (size_t i = 0; i < n; i++) { - resVec[i] = resPtr[i]; - } } - } - break; - case Precision::I64: { - auto out_f = locked.as(); - if (out_f == nullptr) { - IE_THROW() << "Invalid output blob"; - } - auto resPtr = outputMemory.pointer(); - int64_t* resVec = out_f + offset; - - if (h_padding || v_padding_l || v_padding_u) { - size_t i = 0; - for (size_t b = 0; b < size.batch[0]; b++) { - for (size_t f = 0; f < 
size.feature[0]; f++) { - i += v_padding_l; - for (size_t y = 0; y < size.spatial[1]; y++) { - i += l_padd.spatial[0]; - for (size_t x = 0; x < size.spatial[0]; x++, i++) { - *resVec++ = resPtr[i]; - } - i += u_padd.spatial[0]; - } - i += v_padding_u; - } - } - } else { - for (size_t i = 0; i < n; i++) { - resVec[i] = resPtr[i]; - } + } else { + for (size_t i = 0; i < n; i++) { + dst_ptr[i] = src_ptr[i]; } } - break; - default: - IE_THROW() << "The plugin does not support output " << bptr->getTensorDesc().getPrecision() << " precision"; +} + +void CLDNNInferRequest::copyOutputData(cldnn::memory::ptr src, Blob::Ptr dst, buf_info* bi) { + OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::copyOutputData"); + auto& stream = m_graph->GetNetwork()->get_stream(); + switch (dst->getTensorDesc().getPrecision()) { + case Precision::FP32: copyResultToOutputBlob(src, dst, bi, stream); break; + case Precision::FP16: copyResultToOutputBlob(src, dst, bi, stream); break; + case Precision::I32: copyResultToOutputBlob(src, dst, bi, stream); break; + case Precision::I64: copyResultToOutputBlob(src, dst, bi, stream); break; + default: IE_THROW(NotImplemented) << "The plugin does not support output " << dst->getTensorDesc().getPrecision() << " precision"; } } void CLDNNInferRequest::copyInputData(std::shared_ptr network, - const cldnn::primitive_id &inputName, - const cldnn::layout& inputLayout, - const Blob &inputBlob, buf_info* bi) { + const cldnn::primitive_id &inputName, + const cldnn::layout& inputLayout, + const Blob &inputBlob, buf_info* bi) { OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::copyInputData"); - size_t n = (bi == nullptr) ? inputBlob.size() : bi->buf_size; + size_t offset = (bi == nullptr) ? 0 : bi->buf_offset; cldnn::primitive_id internalName = "parameter:" + inputName; @@ -287,37 +194,37 @@ void CLDNNInferRequest::copyInputData(std::shared_ptr network, switch (inputBlob.getTensorDesc().getPrecision()) { case Precision::FP32: { float* blob_ptr = const_cast(locked.as()) + offset; - network->set_input_data(internalName, cldnn::memory::attach(inputLayout, blob_ptr, n)); + network->set_input_data(internalName, network->get_engine().attach_memory(inputLayout, blob_ptr)); break; } case Precision::I32: { int32_t* blob_ptr = const_cast(locked.as()) + offset; - network->set_input_data(internalName, cldnn::memory::attach(inputLayout, blob_ptr, n)); + network->set_input_data(internalName, network->get_engine().attach_memory(inputLayout, blob_ptr)); break; } case Precision::I64: { int64_t* blob_ptr = const_cast(locked.as()) + offset; - network->set_input_data(internalName, cldnn::memory::attach(inputLayout, blob_ptr, n)); + network->set_input_data(internalName, network->get_engine().attach_memory(inputLayout, blob_ptr)); break; } case Precision::FP16: { uint16_t* blob_ptr = const_cast(locked.as()) + offset; - network->set_input_data(internalName, cldnn::memory::attach(inputLayout, blob_ptr, n)); + network->set_input_data(internalName, network->get_engine().attach_memory(inputLayout, blob_ptr)); break; } case Precision::I8: { int8_t* blob_ptr = const_cast(locked.as()) + offset; - network->set_input_data(internalName, cldnn::memory::attach(inputLayout, blob_ptr, n)); + network->set_input_data(internalName, network->get_engine().attach_memory(inputLayout, blob_ptr)); break; } case Precision::U8: { uint8_t* blob_ptr = const_cast(locked.as()) + offset; - network->set_input_data(internalName, cldnn::memory::attach(inputLayout, blob_ptr, n)); + 
network->set_input_data(internalName, network->get_engine().attach_memory(inputLayout, blob_ptr)); break; } case Precision::BOOL: { uint8_t* blob_ptr = const_cast(locked.as()) + offset; - network->set_input_data(internalName, cldnn::memory::attach(inputLayout, blob_ptr, n)); + network->set_input_data(internalName, network->get_engine().attach_memory(inputLayout, blob_ptr)); break; } default: @@ -601,6 +508,7 @@ void CLDNNInferRequest::SetBlob(const std::string& name, const Blob::Ptr &data) void CLDNNInferRequest::AllocateInputs() { OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::AllocateInputs"); auto inputLayouts = m_graph->GetInputLayouts(); + auto& stream = m_graph->GetNetwork()->get_stream(); // allocate inputs for (auto& ni : _networkInputs) { std::string name = ni.first; @@ -623,25 +531,24 @@ void CLDNNInferRequest::AllocateInputs() { input_alloc(UVName, inputLayouts.at(UVName)); size_t height = desc.getDims()[2], width = desc.getDims()[3]; - cldnn::pointer input_mem_ptr_Y = inputsMemory.at(YName).pointer(); + cldnn::mem_lock input_mem_ptr_Y{inputsMemory.at(YName), stream}; TensorDesc ydesc(Precision::U8, { 1, 1, height, width }, Layout::NHWC); auto blobY = createInputBlob(ydesc, input_mem_ptr_Y.data()); - cldnn::pointer input_mem_ptr_UV = inputsMemory.at(UVName).pointer(); + cldnn::mem_lock input_mem_ptr_UV{ inputsMemory.at(UVName), stream }; TensorDesc uvdesc(Precision::U8, { 1, 2, height / 2, width / 2 }, Layout::NHWC); auto blobUV = createInputBlob(uvdesc, input_mem_ptr_UV.data()); blobs.push_back(make_shared_blob(blobY, blobUV)); } _inputs[name] = desc.getDims()[0] == 1 ? blobs[0] : make_shared_blob(blobs); - } else { if (inputLayouts.find(name) == inputLayouts.end()) { IE_THROW() << "Input layout for " << name << " is not found"; } cldnn::layout layout = inputLayouts.at(name); input_alloc(name, layout); - cldnn::pointer mem_ptr = inputsMemory.at(name).pointer(); + cldnn::mem_lock mem_ptr{inputsMemory.at(name), stream}; _inputs[name] = createInputBlob(desc, mem_ptr.data()); if (desc.getPrecision() == Precision::I16 || desc.getPrecision() == Precision::U16) { @@ -685,8 +592,8 @@ void CLDNNInferRequest::AllocateOutputs() { bool can_reuse_internal_mem = !m_useStreams; for (auto& no : _networkOutputs) { std::string outputID = m_graph->MapOutputName(no.first); - cldnn::memory output_mem = m_graph->GetNetwork()->get_output_memory(outputID); - cldnn::pointer output_mem_ptr = output_mem.pointer(); + cldnn::memory::ptr output_mem = m_graph->GetNetwork()->get_output_memory(outputID); + cldnn::mem_lock output_mem_ptr{output_mem, m_graph->GetNetwork()->get_stream()}; if (output_mem_ptr.data() == nullptr) { IE_THROW() << "Empty output memory for primitive " << outputID; } @@ -824,6 +731,7 @@ CLDNNInferRequest::CLDNNInferRequest(InputsDataMap networkInputs, OutputsDataMap void CLDNNInferRequest::execAndParse() { OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::execAndParse"); auto networkOutputs = m_graph->GetNetwork()->execute(); + auto& stream = m_graph->GetNetwork()->get_stream(); // Collect outputs as requested by the model for (auto& no : _networkOutputs) { @@ -835,12 +743,12 @@ void CLDNNInferRequest::execAndParse() { // mapping remote blobs not needed - // let the user take care of them explicitly if (!bptr->is()) { - auto out_ptr = outputMemory.pointer(); + cldnn::mem_lock out_ptr{outputMemory, stream}; auto blob_ptr = bptr->buffer().as(); // If Async API is used, copy of output blobs is not needed, unless SetBlob function was called. 
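
The infer-request rewrite above replaces cldnn::memory values with cldnn::memory::ptr, memory.pointer<T>() with cldnn::mem_lock<T>{mem, stream}, and host-pointer wrapping with engine.attach_memory(layout, ptr). The lock pattern in isolation (headers assumed to live under cldnn/runtime/ like the others in this patch; the element count is passed in purely for illustration):

    #include <cldnn/runtime/engine.hpp>
    #include <cldnn/runtime/memory.hpp>
    #include <cldnn/runtime/stream.hpp>

    void zero_fill(cldnn::engine& engine, cldnn::stream& stream,
                   const cldnn::layout& layout, size_t element_count) {
        cldnn::memory::ptr mem = engine.allocate_memory(layout);  // was cldnn::memory::allocate(engine, layout)
        cldnn::mem_lock<float> lock{mem, stream};                 // was mem.pointer<float>()
        float* data = lock.data();                                // host-visible mapping
        for (size_t i = 0; i < element_count; ++i)
            data[i] = 0.f;
    }   // mapping is released when the lock is destroyed
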
// But in the case when old API is used we have to copy data to memory provided by user. - if (blob_ptr != &out_ptr[0]) { + if (blob_ptr != out_ptr.data()) { copyOutputData(outputMemory, bptr); } } @@ -965,19 +873,20 @@ void CLDNNInferRequest::PrepareInput(const cldnn::primitive_id &inputName, const IE_THROW() << "Input name mismatch."; } auto inputLayout = m_graph->GetInputLayouts().at(inputName); - auto is_same_buffer = [](const Blob& blob, const cldnn::memory& memory) -> bool { + auto is_same_buffer = [&](const Blob& blob, cldnn::memory::ptr memory) -> bool { const std::string str_not_allocated("Input data was not allocated."); - cldnn::pointer ptr = memory.pointer(); + cldnn::mem_lock ptr{memory, m_graph->GetNetwork()->get_stream()}; const uint8_t* blob_ptr = blob.cbuffer().as(); const uint8_t* mem_ptr = ptr.data(); if (blob_ptr == nullptr || mem_ptr == nullptr) { IE_THROW() << str_not_allocated; } - return (blob_ptr == mem_ptr) && (blob.byteSize() == memory.size()); + return (blob_ptr == mem_ptr) && (blob.byteSize() == memory->size()); }; cldnn::primitive_id internalName = "parameter:" + inputName; - const cldnn::memory& memory = inputsMemory.at(inputName); + cldnn::memory::ptr memory = inputsMemory.at(inputName); + auto& stream = m_graph->GetNetwork()->get_stream(); auto _nw_ptr = m_graph->GetNetwork(); auto prec = inputBlob.getTensorDesc().getPrecision(); @@ -986,8 +895,8 @@ void CLDNNInferRequest::PrepareInput(const cldnn::primitive_id &inputName, const _nw_ptr->set_input_data(internalName, memory); } else if (prec == Precision::I16 || prec == Precision::U16) { // clDNN doesn't support I16 input precision, so we always have to convert input data to fp32 precision - const cldnn::memory& fp32_mem = inputsMemory.at(inputName+fp32_suffix); - cldnn::pointer ptr = fp32_mem.pointer(); + cldnn::memory::ptr fp32_mem = inputsMemory.at(inputName+fp32_suffix); + cldnn::mem_lock ptr {fp32_mem, stream}; if (prec == Precision::I16) { copyToFloat(ptr.data(), &inputBlob); } else { @@ -1031,4 +940,4 @@ void CLDNNInferRequest::PrepareInputDyn(const cldnn::primitive_id &inputName, co } } -}; // namespace CLDNNPlugin +} // namespace CLDNNPlugin diff --git a/inference-engine/src/cldnn_engine/cldnn_infer_request.h b/inference-engine/src/cldnn_engine/cldnn_infer_request.h index f9ec4d94db54f5..a988438e8d6657 100644 --- a/inference-engine/src/cldnn_engine/cldnn_infer_request.h +++ b/inference-engine/src/cldnn_engine/cldnn_infer_request.h @@ -46,7 +46,7 @@ class CLDNNInferRequest : public InferenceEngine::IInferRequestInternal { void EnableStreams() { m_useStreams = true; } protected: - std::map inputsMemory; + std::map inputsMemory; std::map outputsMap; bool m_useProfiling; @@ -60,12 +60,12 @@ class CLDNNInferRequest : public InferenceEngine::IInferRequestInternal { InferenceEngine::Blob::Ptr createInputBlob(const InferenceEngine::TensorDesc& desc, uint8_t* mem_ptr = nullptr); InferenceEngine::Blob::Ptr createOutputBlob(const InferenceEngine::TensorDesc& desc, uint8_t* mem_ptr = nullptr); - void copyOutputData(const cldnn::memory& outputMemory, InferenceEngine::Blob::Ptr bptr, buf_info* bi = nullptr); + void copyOutputData(cldnn::memory::ptr outputMemory, InferenceEngine::Blob::Ptr bptr, buf_info* bi = nullptr); void copyInputData(std::shared_ptr network, const cldnn::primitive_id &inputName, const cldnn::layout& inputLayout, const InferenceEngine::Blob &inputBlob, buf_info* bi = nullptr); - void input_attach(cldnn::primitive_id name, cldnn::memory& inputMem); + void input_attach(cldnn::primitive_id name, 
cldnn::memory::ptr inputMem); void input_alloc(cldnn::primitive_id name, const cldnn::layout& layout); void AllocateInputs(); void AllocateOutputs(); @@ -76,9 +76,6 @@ class CLDNNInferRequest : public InferenceEngine::IInferRequestInternal { void PrepareInput(const cldnn::primitive_id &inputName, const InferenceEngine::Blob &inputBlob); void PrepareInputDyn(const cldnn::primitive_id &inputName, const InferenceEngine::Blob &inputBlob); - -private: - static const char fp32_suffix[]; }; }; // namespace CLDNNPlugin diff --git a/inference-engine/src/cldnn_engine/cldnn_program.cpp b/inference-engine/src/cldnn_engine/cldnn_program.cpp index d30434397fdedc..6ff0d4ecef3169 100644 --- a/inference-engine/src/cldnn_engine/cldnn_program.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_program.cpp @@ -92,7 +92,7 @@ bool Program::CanProcessDynBatch(std::vector> ops, return true; } -Program::Program(InferenceEngine::CNNNetwork& network, std::shared_ptr engine, const Config& config, bool createTopologyOnly) +Program::Program(InferenceEngine::CNNNetwork& network, std::shared_ptr engine, const Config& config, bool createTopologyOnly) : m_config(config) , m_engine(engine) , m_curBatch(-1) @@ -128,11 +128,9 @@ Program::Program(InferenceEngine::CNNNetwork& network, std::shared_ptr(b)); m_programs.insert(m_programs.begin(), BuildProgram(ops, networkInputs, networkOutputs, createTopologyOnly)); - m_engine->release_pending_memory(0); } } else { m_programs.emplace_back(BuildProgram(ops, networkInputs, networkOutputs, createTopologyOnly)); - m_engine->release_pending_memory(0); } } diff --git a/inference-engine/src/cldnn_engine/cldnn_program.h b/inference-engine/src/cldnn_engine/cldnn_program.h index a5299d810f9a03..8f90b4fabb7712 100644 --- a/inference-engine/src/cldnn_engine/cldnn_program.h +++ b/inference-engine/src/cldnn_engine/cldnn_program.h @@ -15,8 +15,8 @@ #include "cldnn_config.h" -#include -#include +#include +#include // Forward declarations for cldnn part namespace cldnn { @@ -69,8 +69,8 @@ struct PerfCounter { class Program { public: - Program(InferenceEngine::CNNNetwork& network, std::shared_ptr engine, const Config& config, bool createTopologyOnly = false); - Program(std::shared_ptr engine, const Config& config) : m_config(config), m_engine(engine), + Program(InferenceEngine::CNNNetwork& network, std::shared_ptr engine, const Config& config, bool createTopologyOnly = false); + Program(std::shared_ptr engine, const Config& config) : m_config(config), m_engine(engine), m_curBatch(-1), queryMode(false), m_max_batch(1) {} Program() : m_config({}), m_engine(nullptr), m_curBatch(-1), queryMode(false), m_max_batch(1) {} @@ -100,8 +100,8 @@ class Program { const std::map& GetInputLayouts() const { return inputLayouts; } InferenceEngine::InputsDataMap GetNetworkInputs() const { return m_networkInputs; } InferenceEngine::OutputsDataMap GetNetworkOutputs() const { return m_networkOutputs; } - const cldnn::engine& GetEngine() const { return *m_engine; } - std::shared_ptr GetEnginePtr() const { return m_engine; } + cldnn::engine& GetEngine() const { return *m_engine; } + std::shared_ptr GetEnginePtr() const { return m_engine; } const Config& GetConfig() const { return m_config; } int GetMaxBatchSizeForSingleProgram(); @@ -150,7 +150,7 @@ class Program { private: static factories_map_t factories_map; std::vector> m_programs; - std::shared_ptr m_engine; + std::shared_ptr m_engine; Config m_config; std::shared_ptr m_topology; diff --git a/inference-engine/src/cldnn_engine/cldnn_remote_context.cpp 
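
With the header changes above, Program and CLDNNGraph now hold the engine as a shared pointer to cldnn::engine and hand out a mutable reference, so translation code allocates memory directly through it, as the ops/constant.cpp hunk later in this patch does. The pattern in isolation (the uint8_t lock type and the helper name are illustrative; allocate_memory's second argument mirrors the `false` passed in that hunk):

    #include <cldnn/runtime/engine.hpp>
    #include <cldnn/runtime/memory.hpp>
    #include <cstdint>
    #include <cstring>

    cldnn::memory::ptr make_constant(cldnn::engine& engine, const cldnn::layout& layout,
                                     const void* src, size_t byte_size) {
        cldnn::memory::ptr mem = engine.allocate_memory(layout, false);   // no implicit reset
        cldnn::mem_lock<uint8_t> lock{mem, engine.get_program_stream()};  // maps the buffer
        std::memcpy(lock.data(), src, byte_size);
        return mem;                                                       // unmapped when the lock is destroyed
    }
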
b/inference-engine/src/cldnn_engine/cldnn_remote_context.cpp index f03db1c4834e0d..ce52a5eea074bd 100644 --- a/inference-engine/src/cldnn_engine/cldnn_remote_context.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_remote_context.cpp @@ -6,21 +6,23 @@ #include "cldnn_remote_context.h" #include "cldnn_itt.h" +#include "cldnn/runtime/device_query.hpp" + using namespace InferenceEngine; using namespace InferenceEngine::gpu; using namespace InferenceEngine::details; namespace CLDNNPlugin { -static const char unsupported_str[] = "Unsupported shared object type "; CLDNNRemoteAllocator CLDNNRemoteBlobImpl::m_allocator; CLDNNRemoteBlobImpl::CLDNNRemoteBlobImpl(ClContext::Ptr context, + cldnn::stream& stream, const cldnn::layout& layout, cldnn::shared_handle mem, cldnn::shared_surface surf, uint32_t plane, BlobType mem_type) : - m_context(context), m_layout(layout), m_mem_type(mem_type), m_mem(mem), m_surf(surf), m_plane(plane), + m_context(context), m_stream(stream), m_layout(layout), m_mem_type(mem_type), m_mem(mem), m_surf(surf), m_plane(plane), _handle(nullptr), _allocator(nullptr), m_memObject(nullptr), lockedHolder(nullptr) { } @@ -67,8 +69,7 @@ ParamMap CLDNNRemoteBlobImpl::getParams() const { } bool CLDNNRemoteBlobImpl::deallocate() noexcept { - if (m_memObject != nullptr) - m_memObject.reset(); + m_memObject.reset(); return m_memObject == nullptr; } @@ -86,32 +87,7 @@ void CLDNNRemoteBlobImpl::allocate_if_needed() { _impl->acquire_lock(); if (m_memObject == nullptr) { - auto eng = _impl->GetEngine(); - switch (m_mem_type) { - case BlobType::BT_BUF_INTERNAL: - m_memObject = std::unique_ptr(new cldnn::memory(cldnn::memory::allocate(*eng, m_layout))); - break; - case BlobType::BT_BUF_SHARED: - m_memObject = std::unique_ptr(new cldnn::memory(cldnn::memory::share_buffer(*eng, m_layout, m_mem))); - break; -#ifdef _WIN32 - case BlobType::BT_SURF_SHARED: - m_memObject = std::unique_ptr(new cldnn::memory(cldnn::memory::share_surface(*eng, m_layout, m_mem, m_plane))); - break; - case BlobType::BT_DX_BUF_SHARED: - m_memObject = std::unique_ptr(new cldnn::memory(cldnn::memory::share_dx_buffer(*eng, m_layout, m_mem))); - break; -#else - case BlobType::BT_SURF_SHARED: - m_memObject = std::unique_ptr(new cldnn::memory(cldnn::memory::share_surface(*eng, m_layout, m_surf, m_plane))); - break; -#endif - case BlobType::BT_IMG_SHARED: - m_memObject = std::unique_ptr(new cldnn::memory(cldnn::memory::share_image(*eng, m_layout, m_mem))); - break; - default: - IE_THROW() << unsupported_str << m_mem_type; - } + allocate(); } _impl->release_lock(); @@ -120,32 +96,38 @@ void CLDNNRemoteBlobImpl::allocate_if_needed() { void CLDNNRemoteBlobImpl::allocate() noexcept { assert(m_memObject == nullptr); - std::shared_ptr eng = getContextImpl(m_context.lock())->GetEngine(); + std::shared_ptr eng = getContextImpl(m_context.lock())->GetEngine(); switch (m_mem_type) { - case BlobType::BT_BUF_INTERNAL: - m_memObject = std::unique_ptr(new cldnn::memory(cldnn::memory::allocate(*eng, m_layout))); + case BlobType::BT_BUF_INTERNAL: { + m_memObject = eng->allocate_memory(m_layout); break; - case BlobType::BT_BUF_SHARED: - m_memObject = std::unique_ptr(new cldnn::memory(cldnn::memory::share_buffer(*eng, m_layout, m_mem))); + } + case BlobType::BT_BUF_SHARED: { + m_memObject = eng->share_buffer(m_layout, m_mem); break; + } #ifdef _WIN32 - case BlobType::BT_SURF_SHARED: - m_memObject = std::unique_ptr(new cldnn::memory(cldnn::memory::share_surface(*eng, m_layout, m_mem, m_plane))); + case BlobType::BT_SURF_SHARED: { + m_memObject = 
eng->share_surface(m_layout, m_mem, m_plane); break; - case BlobType::BT_DX_BUF_SHARED: - m_memObject = std::unique_ptr(new cldnn::memory(cldnn::memory::share_dx_buffer(*eng, m_layout, m_mem))); + } + case BlobType::BT_DX_BUF_SHARED: { + m_memObject = eng->share_dx_buffer(m_layout, m_mem); break; + } #else - case BlobType::BT_SURF_SHARED: - m_memObject = std::unique_ptr(new cldnn::memory(cldnn::memory::share_surface(*eng, m_layout, m_surf, m_plane))); + case BlobType::BT_SURF_SHARED: { + m_memObject = eng->share_surface(m_layout, m_surf, m_plane); break; + } #endif - case BlobType::BT_IMG_SHARED: - m_memObject = std::unique_ptr(new cldnn::memory(cldnn::memory::share_image(*eng, m_layout, m_mem))); + case BlobType::BT_IMG_SHARED: { + m_memObject = eng->share_image(m_layout, m_mem); break; + } default: - m_memObject = nullptr; + m_memObject.reset(); } } @@ -165,7 +147,7 @@ std::shared_ptr CLDNNRemoteBlobImpl::getContext() const noexcept } void CLDNNRemoteBlobImpl::lock() const { - lockedHolder = std::unique_ptr>(new cldnn::pointer(m_memObject->pointer())); + lockedHolder = std::unique_ptr>(new cldnn::mem_lock(m_memObject, m_stream)); auto ptr = lockedHolder->data(); _handle = reinterpret_cast(ptr); m_allocator.regLockedBlob(_handle, this); @@ -244,7 +226,11 @@ CLDNNExecutionContextImpl::CLDNNExecutionContextImpl(const std::shared_ptr(dev, - cldnn::engine_configuration((m_config.useProfiling || + bool enable_profiling = (m_config.useProfiling || (m_config.tuningConfig.mode == cldnn::tuning_mode::tuning_tune_and_cache) || - (m_config.tuningConfig.mode == cldnn::tuning_mode::tuning_retune_and_cache)), - false, - m_config.dumpCustomKernels, - std::string(), - std::string(), - true, - std::string(), - m_config.sources_dumps_dir, - m_config.queuePriority, - m_config.queueThrottle, - m_config.memory_pool_on, - m_config.throughput_streams, - m_config.kernels_cache_dir, - m_config.n_threads)); + (m_config.tuningConfig.mode == cldnn::tuning_mode::tuning_retune_and_cache)); + cldnn::queue_types queue_type = cldnn::queue_types::out_of_order; + bool use_unified_shared_memory = true; + m_engine = cldnn::engine::create(engine_type, runtime_type, dev, cldnn::engine_configuration(enable_profiling, + queue_type, + m_config.sources_dumps_dir, + m_config.queuePriority, + m_config.queueThrottle, + m_config.memory_pool_on, + use_unified_shared_memory, + m_config.kernels_cache_dir, + m_config.n_threads)); } } ParamMap CLDNNExecutionContextImpl::getParams() const { - ParamMap ret = { { GPU_PARAM_KEY(OCL_CONTEXT), m_engine->get_context() } }; + ParamMap ret = { { GPU_PARAM_KEY(OCL_CONTEXT), m_engine->get_user_context() } }; switch (m_type) { case OCL: diff --git a/inference-engine/src/cldnn_engine/cldnn_remote_context.h b/inference-engine/src/cldnn_engine/cldnn_remote_context.h index a2ce1729bd10fc..f6a92e82c48764 100644 --- a/inference-engine/src/cldnn_engine/cldnn_remote_context.h +++ b/inference-engine/src/cldnn_engine/cldnn_remote_context.h @@ -4,15 +4,11 @@ #pragma once -#include -#include -#include -#include +#include +#include #include #include #include "cldnn_config.h" -#include -#include #include "cldnn_common_utils.h" #ifndef NOMINMAX @@ -25,6 +21,11 @@ # include #endif +#include +#include +#include +#include + namespace CLDNNPlugin { class CLDNNRemoteAllocator; @@ -41,6 +42,7 @@ class CLDNNRemoteBlobImpl : public InferenceEngine::gpu::details::param_map_obj_ }; explicit CLDNNRemoteBlobImpl(InferenceEngine::gpu::ClContext::Ptr context, + cldnn::stream& stream, const cldnn::layout& layout, 
cldnn::shared_handle mem, cldnn::shared_surface surf, @@ -63,11 +65,12 @@ class CLDNNRemoteBlobImpl : public InferenceEngine::gpu::details::param_map_obj_ bool is_allocated() const noexcept; bool is_locked() const noexcept; void allocate_if_needed(); - cldnn::memory& getMemory() { return *m_memObject; } + cldnn::memory::ptr getMemory() { return m_memObject; } protected: static CLDNNRemoteAllocator m_allocator; std::weak_ptr m_context; + cldnn::stream& m_stream; // constructor stuff cldnn::shared_handle m_mem; @@ -77,9 +80,9 @@ class CLDNNRemoteBlobImpl : public InferenceEngine::gpu::details::param_map_obj_ cldnn::layout m_layout; BlobType m_mem_type; - std::unique_ptr m_memObject; + cldnn::memory::ptr m_memObject; - mutable std::unique_ptr> lockedHolder; + mutable std::unique_ptr> lockedHolder; mutable void* _handle; mutable std::shared_ptr _allocator; @@ -93,13 +96,14 @@ class typedCLDNNRemoteBlob : public TpublicAPI { using Ptr = std::shared_ptr; explicit typedCLDNNRemoteBlob(InferenceEngine::gpu::ClContext::Ptr context, + cldnn::stream& stream, const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout, cldnn::shared_handle mem, cldnn::shared_surface surf, uint32_t plane, CLDNNRemoteBlobImpl::BlobType mem_type) - : _impl(context, layout, mem, surf, plane, mem_type) + : _impl(context, stream, layout, mem, surf, plane, mem_type) , TpublicAPI(desc) {} void allocate() noexcept override { _impl.allocate(); } @@ -231,6 +235,7 @@ class CLDNNExecutionContextImpl : public InferenceEngine::gpu::details::param_ma } protected: + // TODO: refactor to unique_ptr std::shared_ptr m_engine; InferenceEngine::gpu_handle_param m_va_display; Config m_config; @@ -267,6 +272,7 @@ class typedCLDNNExecutionContext : public TpublicContextAPI, using namespace InferenceEngine; using InferenceEngine::gpu::details::param_map_obj_getter; InferenceEngine::RemoteBlob::Ptr ret = nullptr; + auto& stream = _impl.GetEngine()->get_program_stream(); uint32_t plane = param_map_obj_getter::_ObjFromParamSimple(params, GPU_PARAM_KEY(VA_PLANE)); #ifdef _WIN32 cldnn::shared_handle mem = param_map_obj_getter::_ObjFromParamSimple(params, GPU_PARAM_KEY(DEV_OBJECT_HANDLE)); @@ -290,11 +296,11 @@ class typedCLDNNExecutionContext : public TpublicContextAPI, std::dynamic_pointer_cast (std::enable_shared_from_this>::shared_from_this()); #ifdef _WIN32 - ret = std::make_shared(smart_this, + ret = std::make_shared(smart_this, stream, tensorDesc, layout, mem, 0, plane, CLDNNRemoteBlobImpl::BlobType::BT_SURF_SHARED); #else - ret = std::make_shared(smart_this, + ret = std::make_shared(smart_this, stream, tensorDesc, layout, nullptr, surf, plane, CLDNNRemoteBlobImpl::BlobType::BT_SURF_SHARED); #endif @@ -311,6 +317,7 @@ class typedCLDNNExecutionContext : public TpublicContextAPI, InferenceEngine::RemoteBlob::Ptr ret = nullptr; _impl.acquire_lock(); + auto& stream = _impl.GetEngine()->get_program_stream(); // try to locate previously shared object auto itr = shared_obj_reg.find(mem); @@ -327,15 +334,15 @@ class typedCLDNNExecutionContext : public TpublicContextAPI, switch (blob_type) { case CLDNNRemoteBlobImpl::BlobType::BT_BUF_SHARED: - ret = std::make_shared(smart_this, tensorDesc, layout, mem, 0, 0, blob_type); + ret = std::make_shared(smart_this, stream, tensorDesc, layout, mem, 0, 0, blob_type); break; case CLDNNRemoteBlobImpl::BlobType::BT_IMG_SHARED: layout.format = ImageFormatFromLayout(tensorDesc.getLayout()); - ret = std::make_shared(smart_this, tensorDesc, layout, mem, 0, 0, blob_type); + ret = std::make_shared(smart_this, 
stream, tensorDesc, layout, mem, 0, 0, blob_type); break; #ifdef _WIN32 case CLDNNRemoteBlobImpl::BlobType::BT_DX_BUF_SHARED: - ret = std::make_shared(smart_this, tensorDesc, layout, mem, 0, 0, blob_type); + ret = std::make_shared(smart_this, stream, tensorDesc, layout, mem, 0, 0, blob_type); break; #endif default: @@ -354,7 +361,9 @@ class typedCLDNNExecutionContext : public TpublicContextAPI, CldnnTensorFromIEDims(tensorDesc.getDims())); auto smart_this = std::dynamic_pointer_cast (std::enable_shared_from_this>::shared_from_this()); + auto& stream = _impl.GetEngine()->get_program_stream(); return std::make_shared(smart_this, + stream, tensorDesc, layout, nullptr, 0, 0, diff --git a/inference-engine/src/cldnn_engine/ops/batch_to_space.cpp b/inference-engine/src/cldnn_engine/ops/batch_to_space.cpp index 51d70e05f1a525..e46643f0617ca9 100644 --- a/inference-engine/src/cldnn_engine/ops/batch_to_space.cpp +++ b/inference-engine/src/cldnn_engine/ops/batch_to_space.cpp @@ -8,7 +8,7 @@ #include "ngraph/op/batch_to_space.hpp" #include "ngraph/op/constant.hpp" -#include "api/batch_to_space.hpp" +#include "cldnn/primitives/batch_to_space.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/broadcast.cpp b/inference-engine/src/cldnn_engine/ops/broadcast.cpp index af0ed9b7b751da..6d6e63032005c3 100644 --- a/inference-engine/src/cldnn_engine/ops/broadcast.cpp +++ b/inference-engine/src/cldnn_engine/ops/broadcast.cpp @@ -8,9 +8,9 @@ #include "ngraph/op/broadcast.hpp" #include "ngraph/op/constant.hpp" -#include "api/broadcast.hpp" -#include "api/reorder.hpp" -#include "api/reshape.hpp" +#include "cldnn/primitives/broadcast.hpp" +#include "cldnn/primitives/reorder.hpp" +#include "cldnn/primitives/reshape.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/concat.cpp b/inference-engine/src/cldnn_engine/ops/concat.cpp index 5a300c3dc8f394..9d37f959f03fb7 100644 --- a/inference-engine/src/cldnn_engine/ops/concat.cpp +++ b/inference-engine/src/cldnn_engine/ops/concat.cpp @@ -7,7 +7,7 @@ #include "ngraph/op/concat.hpp" -#include "api/concatenation.hpp" +#include "cldnn/primitives/concatenation.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/constant.cpp b/inference-engine/src/cldnn_engine/ops/constant.cpp index e8630f67a6bc31..fea42f31d98227 100644 --- a/inference-engine/src/cldnn_engine/ops/constant.cpp +++ b/inference-engine/src/cldnn_engine/ops/constant.cpp @@ -17,7 +17,7 @@ #include "ngraph/op/variadic_split.hpp" #include "ngraph/op/util/op_types.hpp" -#include "api/data.hpp" +#include "cldnn/primitives/data.hpp" namespace CLDNNPlugin { @@ -169,9 +169,10 @@ void CreateConstantOp(Program& p, const std::shared_ptrsecond; } else { - auto mem = cldnn::memory::allocate(p.GetEngine(), constLayout, 0, false); - auto tmpPointer = mem.pointer(); // implicitly maps buffer - unmap in destructor - auto buf = tmpPointer.data(); + cldnn::memory::ptr mem = p.GetEngine().allocate_memory(constLayout, false); + auto& stream = p.GetEngine().get_program_stream(); + cldnn::mem_lock lock{mem, stream}; + auto buf = lock.data(); auto bufSize = constLayout.bytes_count(); // Do actual weights reorder and change O and I channels order diff --git a/inference-engine/src/cldnn_engine/ops/convert.cpp b/inference-engine/src/cldnn_engine/ops/convert.cpp index 29fb037258f092..6af5bee759d683 100644 --- a/inference-engine/src/cldnn_engine/ops/convert.cpp +++ b/inference-engine/src/cldnn_engine/ops/convert.cpp @@ -8,7 +8,7 @@ #include 
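
The remote-context hunks above build the engine through cldnn::engine::create with explicit engine/runtime types and a queue_types-based engine_configuration, and every shared blob now carries a reference to the program stream. A creation sketch mirroring the argument order of the engine_configuration(...) call in this patch (the priority/throttle enum spellings and the literal defaults are assumptions, not the plugin's values):

    #include <cldnn/runtime/engine.hpp>
    #include <cldnn/runtime/device_query.hpp>
    #include <memory>

    std::shared_ptr<cldnn::engine> create_ocl_engine() {
        cldnn::device_query query(cldnn::engine_types::ocl, cldnn::runtime_types::ocl);
        auto dev = query.get_available_devices().begin()->second;  // assumes at least one device

        cldnn::engine_configuration cfg(false,                             // enable_profiling
                                        cldnn::queue_types::out_of_order,  // queue_type
                                        "",                                // sources_dumps_dir
                                        cldnn::priority_mode_types::disabled,
                                        cldnn::throttle_mode_types::disabled,
                                        true,                              // memory_pool_on
                                        true,                              // use_unified_shared_memory
                                        "",                                // kernels_cache_dir
                                        1);                                // n_threads
        return cldnn::engine::create(cldnn::engine_types::ocl, cldnn::runtime_types::ocl, dev, cfg);
    }
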
"ngraph/op/convert.hpp" #include "ngraph/op/convert_like.hpp" -#include "api/reorder.hpp" +#include "cldnn/primitives/reorder.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/convolution.cpp b/inference-engine/src/cldnn_engine/ops/convolution.cpp index 3207dab623aefa..83f536a68b72fb 100644 --- a/inference-engine/src/cldnn_engine/ops/convolution.cpp +++ b/inference-engine/src/cldnn_engine/ops/convolution.cpp @@ -13,11 +13,11 @@ #include "ngraph/op/fake_quantize.hpp" #include "ngraph/op/util/op_types.hpp" -#include "api/convolution.hpp" -#include "api/deconvolution.hpp" -#include "api/binary_convolution.hpp" -#include "api/permute.hpp" -#include "api/reorder.hpp" +#include "cldnn/primitives/convolution.hpp" +#include "cldnn/primitives/deconvolution.hpp" +#include "cldnn/primitives/binary_convolution.hpp" +#include "cldnn/primitives/permute.hpp" +#include "cldnn/primitives/reorder.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/ctc_greedy_decoder.cpp b/inference-engine/src/cldnn_engine/ops/ctc_greedy_decoder.cpp index 2887360724535e..c8bd8d54e078ea 100644 --- a/inference-engine/src/cldnn_engine/ops/ctc_greedy_decoder.cpp +++ b/inference-engine/src/cldnn_engine/ops/ctc_greedy_decoder.cpp @@ -8,9 +8,9 @@ #include "ngraph/op/ctc_greedy_decoder.hpp" #include "ngraph/op/ctc_greedy_decoder_seq_len.hpp" -#include "api/ctc_greedy_decoder.hpp" -#include "api/reorder.hpp" -#include "api/mutable_data.hpp" +#include "cldnn/primitives/ctc_greedy_decoder.hpp" +#include "cldnn/primitives/reorder.hpp" +#include "cldnn/primitives/mutable_data.hpp" #include "transformations/utils/utils.hpp" @@ -58,7 +58,7 @@ void CreateCommonCTCGreedyDecoderOp(Program& p, const std::shared_ptrget_output_size(); - std::vector shared_memory; + std::vector shared_memory; if (num_output == 2) { auto mutable_precision = op->get_output_element_type(1); if (mutable_precision == ngraph::element::i64) { @@ -70,7 +70,7 @@ void CreateCommonCTCGreedyDecoderOp(Program& p, const std::shared_ptrget_output_shape(1).size()), CldnnTensorFromIEDims(op->get_output_shape(1))); - shared_memory.emplace_back(cldnn::memory::allocate(p.GetEngine(), mutableLayout)); + shared_memory.emplace_back(p.GetEngine().allocate_memory(mutableLayout)); cldnn::primitive_id ctc_gd_mutable_id_w = layer_type_name_ID(op) + "_md_write"; auto ctc_gd_mutable_prim = cldnn::mutable_data(ctc_gd_mutable_id_w, shared_memory[0]); diff --git a/inference-engine/src/cldnn_engine/ops/cum_sum.cpp b/inference-engine/src/cldnn_engine/ops/cum_sum.cpp index 6252a258269899..1bdcec2957eda5 100644 --- a/inference-engine/src/cldnn_engine/ops/cum_sum.cpp +++ b/inference-engine/src/cldnn_engine/ops/cum_sum.cpp @@ -8,7 +8,7 @@ #include "ngraph/op/cum_sum.hpp" #include "ngraph/op/constant.hpp" -#include "api/cum_sum.hpp" +#include "cldnn/primitives/cum_sum.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/custom.cpp b/inference-engine/src/cldnn_engine/ops/custom.cpp index 5211fc94efd3aa..85945bfbdb9698 100644 --- a/inference-engine/src/cldnn_engine/ops/custom.cpp +++ b/inference-engine/src/cldnn_engine/ops/custom.cpp @@ -9,8 +9,8 @@ #include "ngraph/attribute_visitor.hpp" #include "ngraph/node.hpp" -#include "api/custom_gpu_primitive.hpp" -#include "api/reorder.hpp" +#include "cldnn/primitives/custom_gpu_primitive.hpp" +#include "cldnn/primitives/reorder.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/depth_to_space.cpp 
b/inference-engine/src/cldnn_engine/ops/depth_to_space.cpp index 4c407a126d50e0..b53262ab23dec7 100644 --- a/inference-engine/src/cldnn_engine/ops/depth_to_space.cpp +++ b/inference-engine/src/cldnn_engine/ops/depth_to_space.cpp @@ -7,7 +7,7 @@ #include "ngraph/op/depth_to_space.hpp" -#include "api/depth_to_space.hpp" +#include "cldnn/primitives/depth_to_space.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/detection_output.cpp b/inference-engine/src/cldnn_engine/ops/detection_output.cpp index 8bbe102bcfd05d..aa2b505f0e76df 100644 --- a/inference-engine/src/cldnn_engine/ops/detection_output.cpp +++ b/inference-engine/src/cldnn_engine/ops/detection_output.cpp @@ -7,7 +7,7 @@ #include "ngraph/op/detection_output.hpp" -#include "api/detection_output.hpp" +#include "cldnn/primitives/detection_output.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/eltwise.cpp b/inference-engine/src/cldnn_engine/ops/eltwise.cpp index 66dcca7b459c47..817512d5bf92ec 100644 --- a/inference-engine/src/cldnn_engine/ops/eltwise.cpp +++ b/inference-engine/src/cldnn_engine/ops/eltwise.cpp @@ -25,10 +25,10 @@ #include "ngraph/op/power.hpp" #include "ngraph/op/floor_mod.hpp" -#include "api/activation.hpp" -#include "api/eltwise.hpp" -#include "api/reorder.hpp" -#include "api/reshape.hpp" +#include "cldnn/primitives/activation.hpp" +#include "cldnn/primitives/eltwise.hpp" +#include "cldnn/primitives/reorder.hpp" +#include "cldnn/primitives/reshape.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/embedding_bag.cpp b/inference-engine/src/cldnn_engine/ops/embedding_bag.cpp index 35b9f00096d009..2e97a60aebf8be 100644 --- a/inference-engine/src/cldnn_engine/ops/embedding_bag.cpp +++ b/inference-engine/src/cldnn_engine/ops/embedding_bag.cpp @@ -9,8 +9,8 @@ #include "ngraph/op/embeddingbag_offsets_sum.hpp" #include "ngraph/op/embeddingbag_packedsum.hpp" -#include "api/embedding_bag.hpp" -#include "api/reorder.hpp" +#include "cldnn/primitives/embedding_bag.hpp" +#include "cldnn/primitives/reorder.hpp" #include "transformations/utils/utils.hpp" diff --git a/inference-engine/src/cldnn_engine/ops/extract_image_patches.cpp b/inference-engine/src/cldnn_engine/ops/extract_image_patches.cpp index 42f7c4e8e99666..23b5f01432001a 100644 --- a/inference-engine/src/cldnn_engine/ops/extract_image_patches.cpp +++ b/inference-engine/src/cldnn_engine/ops/extract_image_patches.cpp @@ -7,7 +7,7 @@ #include "ngraph/op/extractimagepatches.hpp" -#include "api/extract_image_patches.hpp" +#include "cldnn/primitives/extract_image_patches.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/fake_quantize.cpp b/inference-engine/src/cldnn_engine/ops/fake_quantize.cpp index ca3d950694c4f3..345a70f34bbce3 100644 --- a/inference-engine/src/cldnn_engine/ops/fake_quantize.cpp +++ b/inference-engine/src/cldnn_engine/ops/fake_quantize.cpp @@ -7,7 +7,7 @@ #include "ngraph/op/fake_quantize.hpp" -#include "api/quantize.hpp" +#include "cldnn/primitives/quantize.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/gather tree.cpp b/inference-engine/src/cldnn_engine/ops/gather tree.cpp index a6c806bcbe8c1b..6b73131fd29d84 100644 --- a/inference-engine/src/cldnn_engine/ops/gather tree.cpp +++ b/inference-engine/src/cldnn_engine/ops/gather tree.cpp @@ -7,8 +7,8 @@ #include "ngraph/op/gather_tree.hpp" -#include "api/gather_tree.hpp" -#include "api/reorder.hpp" +#include "cldnn/primitives/gather_tree.hpp" +#include 
"cldnn/primitives/reorder.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/gather.cpp b/inference-engine/src/cldnn_engine/ops/gather.cpp index b80e26661e5353..362854cc32aaeb 100644 --- a/inference-engine/src/cldnn_engine/ops/gather.cpp +++ b/inference-engine/src/cldnn_engine/ops/gather.cpp @@ -7,8 +7,8 @@ #include "ngraph/op/gather.hpp" -#include "api/gather.hpp" -#include "api/reorder.hpp" +#include "cldnn/primitives/gather.hpp" +#include "cldnn/primitives/reorder.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/gather_nd.cpp b/inference-engine/src/cldnn_engine/ops/gather_nd.cpp index 6a1cd65132928e..cbdc5659bb3197 100644 --- a/inference-engine/src/cldnn_engine/ops/gather_nd.cpp +++ b/inference-engine/src/cldnn_engine/ops/gather_nd.cpp @@ -8,7 +8,7 @@ #include "ngraph/op/gather_nd.hpp" #include "ngraph/op/constant.hpp" -#include "api/gather_nd.hpp" +#include "cldnn/primitives/gather_nd.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/grn.cpp b/inference-engine/src/cldnn_engine/ops/grn.cpp index 3eb750f85a920c..960dd03494735d 100644 --- a/inference-engine/src/cldnn_engine/ops/grn.cpp +++ b/inference-engine/src/cldnn_engine/ops/grn.cpp @@ -7,7 +7,7 @@ #include "ngraph/op/grn.hpp" -#include "api/grn.hpp" +#include "cldnn/primitives/grn.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/interpolate.cpp b/inference-engine/src/cldnn_engine/ops/interpolate.cpp index f9241b8ca0fd75..b77999289c631b 100644 --- a/inference-engine/src/cldnn_engine/ops/interpolate.cpp +++ b/inference-engine/src/cldnn_engine/ops/interpolate.cpp @@ -9,7 +9,7 @@ #include "ngraph/op/interpolate.hpp" #include "ngraph/op/constant.hpp" -#include "api/resample.hpp" +#include "cldnn/primitives/resample.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/lrn.cpp b/inference-engine/src/cldnn_engine/ops/lrn.cpp index f8e7c601645cec..c13c17daaeb13d 100644 --- a/inference-engine/src/cldnn_engine/ops/lrn.cpp +++ b/inference-engine/src/cldnn_engine/ops/lrn.cpp @@ -8,7 +8,7 @@ #include "ngraph/op/lrn.hpp" #include "ngraph/op/constant.hpp" -#include "api/lrn.hpp" +#include "cldnn/primitives/lrn.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/matmul.cpp b/inference-engine/src/cldnn_engine/ops/matmul.cpp index b18e199581cd15..a8818c9e6f67ee 100644 --- a/inference-engine/src/cldnn_engine/ops/matmul.cpp +++ b/inference-engine/src/cldnn_engine/ops/matmul.cpp @@ -9,11 +9,11 @@ #include "ngraph/op/constant.hpp" #include "ngraph/op/fake_quantize.hpp" -#include "api/gemm.hpp" -#include "api/fully_connected.hpp" -#include "api/reshape.hpp" -#include "api/reorder.hpp" -#include "api/permute.hpp" +#include "cldnn/primitives/gemm.hpp" +#include "cldnn/primitives/fully_connected.hpp" +#include "cldnn/primitives/reshape.hpp" +#include "cldnn/primitives/reorder.hpp" +#include "cldnn/primitives/permute.hpp" namespace CLDNNPlugin { @@ -83,10 +83,11 @@ void CreateMatMulOp(Program& p, const std::shared_ptr& o for (auto o = transpose_order.size(); o < 4; o++) transpose_order.push_back((uint16_t)o); + std::vector cldnn_permute_order = ConvertPermuteOrder(transpose_order); auto permuteName = op->get_friendly_name() + "/transpose_b"; auto permutePrim = cldnn::permute(permuteName, weightsName, - transpose_order); + cldnn_permute_order); p.AddPrimitive(permutePrim); p.AddInnerPrimitiveToProfiler(permuteName, layerName, op); weightsName = permuteName; @@ -102,10 +103,11 @@ void 
CreateMatMulOp(Program& p, const std::shared_ptr& o for (auto o = transpose_order.size(); o < 4; o++) transpose_order.push_back((uint16_t)o); + std::vector cldnn_permute_order = ConvertPermuteOrder(transpose_order); auto permuteName = op->get_friendly_name() + "/transpose_a"; auto permutePrim = cldnn::permute(permuteName, inputName, - transpose_order); + cldnn_permute_order); p.AddPrimitive(permutePrim); p.AddInnerPrimitiveToProfiler(permuteName, layerName, op); inputName = permuteName; diff --git a/inference-engine/src/cldnn_engine/ops/mvn.cpp b/inference-engine/src/cldnn_engine/ops/mvn.cpp index 79001b812e1854..b9cb376a24e34e 100644 --- a/inference-engine/src/cldnn_engine/ops/mvn.cpp +++ b/inference-engine/src/cldnn_engine/ops/mvn.cpp @@ -8,7 +8,8 @@ #include "ngraph/op/mvn.hpp" #include "ngraph/op/constant.hpp" -#include "api/mvn.hpp" +#include "cldnn/primitives/mvn.hpp" + #include namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/non_max_suppression.cpp b/inference-engine/src/cldnn_engine/ops/non_max_suppression.cpp index 8b124309467e10..a3d4834c51eed1 100644 --- a/inference-engine/src/cldnn_engine/ops/non_max_suppression.cpp +++ b/inference-engine/src/cldnn_engine/ops/non_max_suppression.cpp @@ -9,9 +9,9 @@ #include #include -#include "api/reorder.hpp" -#include "api/mutable_data.hpp" -#include "api/non_max_suppression.hpp" +#include "cldnn/primitives/reorder.hpp" +#include "cldnn/primitives/mutable_data.hpp" +#include "cldnn/primitives/non_max_suppression.hpp" namespace CLDNNPlugin { @@ -62,7 +62,7 @@ void CreateNonMaxSuppressionIEInternalOp(Program& p, const std::shared_ptrget_output_size(); - std::vector shared_memory; + std::vector shared_memory; switch (num_output) { case 3: { auto mutable_precision_second = op->get_output_element_type(2); @@ -74,7 +74,7 @@ void CreateNonMaxSuppressionIEInternalOp(Program& p, const std::shared_ptrget_output_shape(2).size()), CldnnTensorFromIEDims(op->get_output_shape(2))); - shared_memory.emplace_back(cldnn::memory::allocate(p.GetEngine(), mutableLayoutSecond)); + shared_memory.emplace_back(p.GetEngine().allocate_memory(mutableLayoutSecond)); cldnn::primitive_id non_max_supression_mutable_id_w_second = layer_type_name_ID(op) + "_md_write_second"; auto nms_mutable_prim_second = cldnn::mutable_data(non_max_supression_mutable_id_w_second, shared_memory.back()); @@ -91,7 +91,7 @@ void CreateNonMaxSuppressionIEInternalOp(Program& p, const std::shared_ptr(op->get_output_element_type(0), ngraph::Shape{1}, std::vector{1.0}); cldnn::layout constLayout = cldnn::layout(DataTypeFromPrecision(op->get_output_element_type(0)), cldnn::format::bfyx, cldnn::tensor{1}); - auto mem = cldnn::memory::allocate(p.GetEngine(), constLayout, 0, false); - auto tmpPointer = mem.pointer(); // implicitly maps buffer - unmap in destructor + auto mem = p.GetEngine().allocate_memory(constLayout, false); + cldnn::mem_lock tmpPointer{mem, p.GetEngine().get_program_stream()}; auto buf = tmpPointer.data(); auto bufSize = scale->get_output_tensor(0).size(); diff --git a/inference-engine/src/cldnn_engine/ops/one_hot.cpp b/inference-engine/src/cldnn_engine/ops/one_hot.cpp index 1076bf595efb0f..3d792bda8aee0e 100644 --- a/inference-engine/src/cldnn_engine/ops/one_hot.cpp +++ b/inference-engine/src/cldnn_engine/ops/one_hot.cpp @@ -8,7 +8,7 @@ #include "ngraph/op/one_hot.hpp" -#include "api/one_hot.hpp" +#include "cldnn/primitives/one_hot.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/pad.cpp 
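The two CreateMatMulOp hunks above stop passing the padded IE transpose order straight to cldnn::permute and first run it through ConvertPermuteOrder. A hedged sketch of the call-site change is below; the uint16_t element type of the order vectors matches the visible push_back casts, but the helper's exact signature and return type are assumptions, since its definition is outside this diff.

    #include <cstdint>
    #include <vector>

    // Declaration only: the helper is defined elsewhere in the clDNN plugin and maps
    // an IE-style permute order onto the order expected by cldnn::permute (assumed
    // signature, based on how the hunks above use it).
    std::vector<uint16_t> ConvertPermuteOrder(const std::vector<uint16_t>& ie_order);

    // Pad the order out to rank 4 and convert it, as CreateMatMulOp now does before
    // building the "/transpose_a" and "/transpose_b" permute primitives.
    std::vector<uint16_t> PadAndConvertOrder(std::vector<uint16_t> transpose_order) {
        for (auto o = transpose_order.size(); o < 4; o++)
            transpose_order.push_back(static_cast<uint16_t>(o));
        return ConvertPermuteOrder(transpose_order);
    }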
b/inference-engine/src/cldnn_engine/ops/pad.cpp index a3503318a77f79..0d409414b58776 100644 --- a/inference-engine/src/cldnn_engine/ops/pad.cpp +++ b/inference-engine/src/cldnn_engine/ops/pad.cpp @@ -8,7 +8,7 @@ #include "ngraph/op/pad.hpp" -#include "api/border.hpp" +#include "cldnn/primitives/border.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/parameter.cpp b/inference-engine/src/cldnn_engine/ops/parameter.cpp index 7c61d8261d7881..b68593dd0a5b0f 100644 --- a/inference-engine/src/cldnn_engine/ops/parameter.cpp +++ b/inference-engine/src/cldnn_engine/ops/parameter.cpp @@ -7,10 +7,10 @@ #include "ngraph/op/parameter.hpp" -#include "api/input_layout.hpp" -#include "api/reorder.hpp" -#include "api/data.hpp" -#include "api/concatenation.hpp" +#include "cldnn/primitives/input_layout.hpp" +#include "cldnn/primitives/reorder.hpp" +#include "cldnn/primitives/data.hpp" +#include "cldnn/primitives/concatenation.hpp" using namespace InferenceEngine; @@ -158,8 +158,8 @@ void CreateParameterOp(Program& p, const std::shared_ptrsecond; } else { - auto mem = cldnn::memory::allocate(p.GetEngine(), meanBlobLayout, 0, false); - auto tmpPointer = mem.pointer(); // implicitly maps buffer - unmap in destructor + auto mem = p.GetEngine().allocate_memory(meanBlobLayout, false); + cldnn::mem_lock tmpPointer{ mem, p.GetEngine().get_program_stream() }; auto buf = tmpPointer.data(); auto bufSize = meanBlobLayout.bytes_count(); diff --git a/inference-engine/src/cldnn_engine/ops/pooling.cpp b/inference-engine/src/cldnn_engine/ops/pooling.cpp index 16ca93a6879d3c..f1bf6952292056 100644 --- a/inference-engine/src/cldnn_engine/ops/pooling.cpp +++ b/inference-engine/src/cldnn_engine/ops/pooling.cpp @@ -8,7 +8,7 @@ #include "ngraph/op/max_pool.hpp" #include "ngraph/op/avg_pool.hpp" -#include "api/pooling.hpp" +#include "cldnn/primitives/pooling.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/prior_box.cpp b/inference-engine/src/cldnn_engine/ops/prior_box.cpp index 07c6a4ca3ee240..6cf0aaa65355d3 100644 --- a/inference-engine/src/cldnn_engine/ops/prior_box.cpp +++ b/inference-engine/src/cldnn_engine/ops/prior_box.cpp @@ -8,7 +8,7 @@ #include "ngraph/op/prior_box.hpp" #include "ngraph/op/prior_box_clustered.hpp" -#include "api/prior_box.hpp" +#include "cldnn/primitives/prior_box.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/proposal.cpp b/inference-engine/src/cldnn_engine/ops/proposal.cpp index 70ff3d154f107d..d5b906e5e6e057 100644 --- a/inference-engine/src/cldnn_engine/ops/proposal.cpp +++ b/inference-engine/src/cldnn_engine/ops/proposal.cpp @@ -7,8 +7,8 @@ #include "ngraph/op/proposal.hpp" -#include "api/proposal.hpp" -#include "api/mutable_data.hpp" +#include "cldnn/primitives/proposal.hpp" +#include "cldnn/primitives/mutable_data.hpp" namespace CLDNNPlugin { @@ -62,7 +62,7 @@ void CreateProposalOp(Program& p, const std::shared_ptrget_output_shape(1).size()), CldnnTensorFromIEDims(op->get_output_shape(1))); - auto shared_memory = cldnn::memory::allocate(p.GetEngine(), mutableLayout); + auto shared_memory = p.GetEngine().allocate_memory(mutableLayout); cldnn::primitive_id proposal_mutable_id_w = layer_type_name_ID(op) + "_md_write"; auto argmax_mutable_prim = cldnn::mutable_data(proposal_mutable_id_w, shared_memory); diff --git a/inference-engine/src/cldnn_engine/ops/reduce.cpp b/inference-engine/src/cldnn_engine/ops/reduce.cpp deleted file mode 100644 index 26343ffb813129..00000000000000 --- 
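The CTCGreedyDecoder, NonMaxSuppression and Proposal hunks above (and the TopK hunk further down) all port the same idiom for a primitive's optional extra output: allocate a buffer from the engine and wrap it in cldnn::mutable_data so the kernel can write into it. A short sketch under the new API, with the runtime header path assumed and the "_md_write" suffix taken from the hunks:

    #include <string>

    #include "cldnn/primitives/mutable_data.hpp"   // new include prefix used throughout this patch
    #include "cldnn/runtime/engine.hpp"            // assumed location of the engine header

    // Build the writable buffer backing an extra primitive output, mirroring the
    // "_md_write" mutable_data primitives created in the hunks above.
    static cldnn::mutable_data make_extra_output(cldnn::engine& engine,
                                                 const std::string& layer_id,
                                                 const cldnn::layout& mutableLayout) {
        // allocate_memory now returns cldnn::memory::ptr (a shared pointer), which is
        // what the mutable_data constructor takes in the updated code.
        cldnn::memory::ptr shared_memory = engine.allocate_memory(mutableLayout);
        return cldnn::mutable_data(layer_id + "_md_write", shared_memory);
    }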
a/inference-engine/src/cldnn_engine/ops/reduce.cpp +++ /dev/null @@ -1,169 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "cldnn_program.h" -#include "cldnn_common_utils.h" - -#include "ngraph/op/reduce_sum.hpp" -#include "ngraph/op/reduce_prod.hpp" -#include "ngraph/op/reduce_mean.hpp" -#include "ngraph/op/reduce_logical_or.hpp" -#include "ngraph/op/reduce_logical_and.hpp" -#include "ngraph/op/reduce_l1.hpp" -#include "ngraph/op/reduce_l2.hpp" -#include "ngraph/op/min.hpp" -#include "ngraph/op/max.hpp" -#include "ngraph/op/constant.hpp" - -#include "api/reduce.hpp" -#include "api/reorder.hpp" -#include "api/reshape.hpp" - -namespace CLDNNPlugin { - -void CreateReduceOp(Program& p, const std::shared_ptr& op, cldnn::reduce_mode mode, bool keep_dims) { - p.ValidateInputs(op, {2}); - auto inputPrimitives = p.GetInputPrimitiveIDs(op); - std::string layerName = layer_type_name_ID(op); - - size_t rank = op->get_input_shape(0).size(); - - auto axes_constant = std::dynamic_pointer_cast(op->get_input_node_shared_ptr(1)); - if (!axes_constant) { - IE_THROW() << "Unsupported parameter nodes type in " << op->get_friendly_name() << " (" << op->get_type_name() << ")"; - } - std::vector rawAxes = axes_constant->cast_vector(); - - std::vector axes; - for (size_t a = 0; a < rawAxes.size(); a++) { - if (rawAxes[a] < 0) - rawAxes[a] = rawAxes[a] + rank; - if (rawAxes[a] < 0 || rawAxes[a] > rank - 1) - IE_THROW() << op->get_friendly_name() << " Incorrect Reduce axis value: " << rawAxes[a]; - if (rank == 6) { - switch (rawAxes[a]) { - case 0: axes.push_back(cldnn::reduce::along_b); break; - case 1: axes.push_back(cldnn::reduce::along_f); break; - case 2: axes.push_back(cldnn::reduce::along_w); break; - case 3: axes.push_back(cldnn::reduce::along_z); break; - case 4: axes.push_back(cldnn::reduce::along_y); break; - case 5: axes.push_back(cldnn::reduce::along_x); break; - } - } else if (rank == 5) { - switch (rawAxes[a]) { - case 0: axes.push_back(cldnn::reduce::along_b); break; - case 1: axes.push_back(cldnn::reduce::along_f); break; - case 2: axes.push_back(cldnn::reduce::along_z); break; - case 3: axes.push_back(cldnn::reduce::along_y); break; - case 4: axes.push_back(cldnn::reduce::along_x); break; - } - } else { - switch (rawAxes[a]) { - case 0: axes.push_back(cldnn::reduce::along_b); break; - case 1: axes.push_back(cldnn::reduce::along_f); break; - case 2: axes.push_back(cldnn::reduce::along_y); break; - case 3: axes.push_back(cldnn::reduce::along_x); break; - } - } - } - - sort(axes.begin(), axes.end()); - axes.erase(unique(axes.begin(), axes.end()), axes.end()); - - auto reducePrim = cldnn::reduce(layerName, - inputPrimitives[0], - mode, - axes, - static_cast(keep_dims)); - - p.AddPrimitive(reducePrim); - - auto resultLayerName = layerName; - auto out_dims = op->get_output_shape(0).size(); - if (out_dims == 3 && !keep_dims && rank >= 4) { - resultLayerName = layerName + "_reshape"; - auto out_shape = op->get_output_shape(0); - cldnn::tensor outTensor; - switch (rank) { - case 6: - outTensor = cldnn::tensor(TensorValue(out_shape[0]), TensorValue(out_shape[1]), - 1, TensorValue(out_shape[2]), 1, 1); - case 5: - outTensor = cldnn::tensor(TensorValue(out_shape[0]), TensorValue(out_shape[1]), - 1, TensorValue(out_shape[2]), 1); - case 4: - outTensor = cldnn::tensor(TensorValue(out_shape[0]), TensorValue(out_shape[1]), - 1, TensorValue(out_shape[2])); - } - auto reshape_prim = cldnn::reshape(resultLayerName, layerName, outTensor); - 
p.AddPrimitive(reshape_prim); - p.AddPrimitiveToProfiler(op, resultLayerName); - } - - auto reorderLayerName = layerName + "_reorder"; - cldnn::format out_format = cldnn::format::any; - auto out_dt = DataTypeFromPrecision(op->get_output_element_type(0)); - if (!keep_dims && rank > 4) { - if (rank - rawAxes.size() == 6) - out_format = cldnn::format::bfwzyx; - else if (rank - rawAxes.size() == 5) - out_format = cldnn::format::bfzyx; - else if (rank - rawAxes.size() <= 4) - out_format = cldnn::format::bfyx; - - auto reorder_prim = cldnn::reorder(reorderLayerName, resultLayerName, out_format, out_dt); - p.AddPrimitive(reorder_prim); - p.AddPrimitiveToProfiler(op, reorderLayerName); - } else { - p.AddPrimitiveToProfiler(op); - } -} - -void CreateReduceMaxOp(Program& p, const std::shared_ptr& op) { - CreateReduceOp(p, op, cldnn::reduce_mode::max, op->get_keep_dims()); -} - -void CreateReduceLogicalAndOp(Program& p, const std::shared_ptr& op) { - CreateReduceOp(p, op, cldnn::reduce_mode::logical_and, op->get_keep_dims()); -} - -void CreateReduceLogicalOrOp(Program& p, const std::shared_ptr& op) { - CreateReduceOp(p, op, cldnn::reduce_mode::logical_or, op->get_keep_dims()); -} - -void CreateReduceMeanOp(Program& p, const std::shared_ptr& op) { - CreateReduceOp(p, op, cldnn::reduce_mode::mean, op->get_keep_dims()); -} - -void CreateReduceMinOp(Program& p, const std::shared_ptr& op) { - CreateReduceOp(p, op, cldnn::reduce_mode::min, op->get_keep_dims()); -} - -void CreateReduceProdOp(Program& p, const std::shared_ptr& op) { - CreateReduceOp(p, op, cldnn::reduce_mode::prod, op->get_keep_dims()); -} - -void CreateReduceSumOp(Program& p, const std::shared_ptr& op) { - CreateReduceOp(p, op, cldnn::reduce_mode::sum, op->get_keep_dims()); -} - -void CreateReduceL1Op(Program& p, const std::shared_ptr& op) { - CreateReduceOp(p, op, cldnn::reduce_mode::l1, op->get_keep_dims()); -} - -void CreateReduceL2Op(Program& p, const std::shared_ptr& op) { - CreateReduceOp(p, op, cldnn::reduce_mode::l2, op->get_keep_dims()); -} - -REGISTER_FACTORY_IMPL(v1, ReduceMax); -REGISTER_FACTORY_IMPL(v1, ReduceLogicalAnd); -REGISTER_FACTORY_IMPL(v1, ReduceLogicalOr); -REGISTER_FACTORY_IMPL(v1, ReduceMean); -REGISTER_FACTORY_IMPL(v1, ReduceMin); -REGISTER_FACTORY_IMPL(v1, ReduceProd); -REGISTER_FACTORY_IMPL(v1, ReduceSum); -REGISTER_FACTORY_IMPL(v4, ReduceL1); -REGISTER_FACTORY_IMPL(v4, ReduceL2); - -} // namespace CLDNNPlugin diff --git a/inference-engine/src/cldnn_engine/ops/region_yolo.cpp b/inference-engine/src/cldnn_engine/ops/region_yolo.cpp index 6e5fd660e3fa91..348dd0f7eeb581 100644 --- a/inference-engine/src/cldnn_engine/ops/region_yolo.cpp +++ b/inference-engine/src/cldnn_engine/ops/region_yolo.cpp @@ -7,7 +7,7 @@ #include "ngraph/op/region_yolo.hpp" -#include "api/region_yolo.hpp" +#include "cldnn/primitives/region_yolo.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/reorg_yolo.cpp b/inference-engine/src/cldnn_engine/ops/reorg_yolo.cpp index 7ddc45221a6cfb..4a7f54cf810bc2 100644 --- a/inference-engine/src/cldnn_engine/ops/reorg_yolo.cpp +++ b/inference-engine/src/cldnn_engine/ops/reorg_yolo.cpp @@ -7,7 +7,7 @@ #include "ngraph/op/reorg_yolo.hpp" -#include "api/reorg_yolo.hpp" +#include "cldnn/primitives/reorg_yolo.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/reshape.cpp b/inference-engine/src/cldnn_engine/ops/reshape.cpp index b2111e77f64aa7..f0084bb6a1cb21 100644 --- a/inference-engine/src/cldnn_engine/ops/reshape.cpp +++ 
b/inference-engine/src/cldnn_engine/ops/reshape.cpp @@ -9,8 +9,8 @@ #include "ngraph/op/squeeze.hpp" #include "ngraph/op/unsqueeze.hpp" -#include "api/reshape.hpp" -#include "api/reorder.hpp" +#include "cldnn/primitives/reshape.hpp" +#include "cldnn/primitives/reorder.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/result.cpp b/inference-engine/src/cldnn_engine/ops/result.cpp index 4974f1ec56cdc7..fe0d0f05658018 100644 --- a/inference-engine/src/cldnn_engine/ops/result.cpp +++ b/inference-engine/src/cldnn_engine/ops/result.cpp @@ -7,7 +7,7 @@ #include "ngraph/op/result.hpp" -#include "api/reorder.hpp" +#include "cldnn/primitives/reorder.hpp" using namespace InferenceEngine; diff --git a/inference-engine/src/cldnn_engine/ops/reverse_sequence.cpp b/inference-engine/src/cldnn_engine/ops/reverse_sequence.cpp index 4537d3d34b7f0a..766bbc89a31d57 100644 --- a/inference-engine/src/cldnn_engine/ops/reverse_sequence.cpp +++ b/inference-engine/src/cldnn_engine/ops/reverse_sequence.cpp @@ -7,7 +7,7 @@ #include "ngraph/op/reverse_sequence.hpp" -#include "api/reverse_sequence.hpp" +#include "cldnn/primitives/reverse_sequence.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/rnn.cpp b/inference-engine/src/cldnn_engine/ops/rnn.cpp index a1bab946db42ec..2d4705f1a910fa 100644 --- a/inference-engine/src/cldnn_engine/ops/rnn.cpp +++ b/inference-engine/src/cldnn_engine/ops/rnn.cpp @@ -8,12 +8,12 @@ #include "ngraph/op/lstm_cell.hpp" #include "ngraph/op/lstm_sequence.hpp" -#include "api/reshape.hpp" -#include "api/reorder.hpp" -#include "api/fully_connected.hpp" -#include "api/lstm.hpp" -#include "api/crop.hpp" -#include "api/concatenation.hpp" +#include "cldnn/primitives/reshape.hpp" +#include "cldnn/primitives/reorder.hpp" +#include "cldnn/primitives/fully_connected.hpp" +#include "cldnn/primitives/lstm.hpp" +#include "cldnn/primitives/crop.hpp" +#include "cldnn/primitives/concatenation.hpp" namespace CLDNNPlugin { cldnn::activation_func GetActivationFunc(std::string name) { diff --git a/inference-engine/src/cldnn_engine/ops/roi_pooling.cpp b/inference-engine/src/cldnn_engine/ops/roi_pooling.cpp index 8dc57e3a5c63ff..f2087756405404 100644 --- a/inference-engine/src/cldnn_engine/ops/roi_pooling.cpp +++ b/inference-engine/src/cldnn_engine/ops/roi_pooling.cpp @@ -9,7 +9,7 @@ #include "ngraph/op/psroi_pooling.hpp" #include "ngraph/op/deformable_psroi_pooling.hpp" -#include "api/roi_pooling.hpp" +#include "cldnn/primitives/roi_pooling.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/scatter_elements_update.cpp b/inference-engine/src/cldnn_engine/ops/scatter_elements_update.cpp index 98c22e2a19966e..19f63dbf3a3986 100644 --- a/inference-engine/src/cldnn_engine/ops/scatter_elements_update.cpp +++ b/inference-engine/src/cldnn_engine/ops/scatter_elements_update.cpp @@ -8,7 +8,7 @@ #include "ngraph/op/scatter_elements_update.hpp" #include "ngraph/op/constant.hpp" -#include "api/scatter_elements_update.hpp" +#include "cldnn/primitives/scatter_elements_update.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/scatter_nd_update.cpp b/inference-engine/src/cldnn_engine/ops/scatter_nd_update.cpp index 1b422736575054..cf5d059772a245 100644 --- a/inference-engine/src/cldnn_engine/ops/scatter_nd_update.cpp +++ b/inference-engine/src/cldnn_engine/ops/scatter_nd_update.cpp @@ -8,7 +8,7 @@ #include "ngraph/op/scatter_nd_update.hpp" #include "ngraph/op/constant.hpp" -#include "api/scatter_nd_update.hpp" 
+#include "cldnn/primitives/scatter_nd_update.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/scatter_update.cpp b/inference-engine/src/cldnn_engine/ops/scatter_update.cpp index 85388032de8690..c6fb4e9c0aa003 100644 --- a/inference-engine/src/cldnn_engine/ops/scatter_update.cpp +++ b/inference-engine/src/cldnn_engine/ops/scatter_update.cpp @@ -8,7 +8,7 @@ #include "ngraph/op/scatter_update.hpp" #include "ngraph/op/constant.hpp" -#include "api/scatter_update.hpp" +#include "cldnn/primitives/scatter_update.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/select.cpp b/inference-engine/src/cldnn_engine/ops/select.cpp index 0d3ae8e182c901..6b3866ac5dbaa9 100644 --- a/inference-engine/src/cldnn_engine/ops/select.cpp +++ b/inference-engine/src/cldnn_engine/ops/select.cpp @@ -7,9 +7,9 @@ #include "ngraph/op/select.hpp" -#include "api/select.hpp" -#include "api/reorder.hpp" -#include "api/reshape.hpp" +#include "cldnn/primitives/select.hpp" +#include "cldnn/primitives/reorder.hpp" +#include "cldnn/primitives/reshape.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/shuffle_channels.cpp b/inference-engine/src/cldnn_engine/ops/shuffle_channels.cpp index 0c0cafd03ef633..f3066ace26acc4 100644 --- a/inference-engine/src/cldnn_engine/ops/shuffle_channels.cpp +++ b/inference-engine/src/cldnn_engine/ops/shuffle_channels.cpp @@ -7,7 +7,7 @@ #include "ngraph/op/shuffle_channels.hpp" -#include "api/shuffle_channels.hpp" +#include "cldnn/primitives/shuffle_channels.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/softmax.cpp b/inference-engine/src/cldnn_engine/ops/softmax.cpp index 566aca800ea398..cbaffc04accd49 100644 --- a/inference-engine/src/cldnn_engine/ops/softmax.cpp +++ b/inference-engine/src/cldnn_engine/ops/softmax.cpp @@ -8,8 +8,8 @@ #include "ngraph/op/softmax.hpp" #include "ngraph/op/log_softmax.hpp" -#include "api/softmax.hpp" -#include "api/activation.hpp" +#include "cldnn/primitives/softmax.hpp" +#include "cldnn/primitives/activation.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/space_to_batch.cpp b/inference-engine/src/cldnn_engine/ops/space_to_batch.cpp index 7846375dff700c..fa57d178038882 100644 --- a/inference-engine/src/cldnn_engine/ops/space_to_batch.cpp +++ b/inference-engine/src/cldnn_engine/ops/space_to_batch.cpp @@ -8,7 +8,7 @@ #include "ngraph/op/space_to_batch.hpp" #include "ngraph/op/constant.hpp" -#include "api/space_to_batch.hpp" +#include "cldnn/primitives/space_to_batch.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/space_to_depth.cpp b/inference-engine/src/cldnn_engine/ops/space_to_depth.cpp index a8d73eea79d545..df4a25b469811a 100644 --- a/inference-engine/src/cldnn_engine/ops/space_to_depth.cpp +++ b/inference-engine/src/cldnn_engine/ops/space_to_depth.cpp @@ -7,7 +7,7 @@ #include "ngraph/op/space_to_depth.hpp" -#include "api/space_to_depth.hpp" +#include "cldnn/primitives/space_to_depth.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/split.cpp b/inference-engine/src/cldnn_engine/ops/split.cpp index 6706ccebd8b88c..33b48c6d17fb2a 100644 --- a/inference-engine/src/cldnn_engine/ops/split.cpp +++ b/inference-engine/src/cldnn_engine/ops/split.cpp @@ -8,7 +8,7 @@ #include "ngraph/op/split.hpp" #include "ngraph/op/variadic_split.hpp" -#include "api/crop.hpp" +#include "cldnn/primitives/crop.hpp" namespace CLDNNPlugin { diff --git 
a/inference-engine/src/cldnn_engine/ops/strided_slice.cpp b/inference-engine/src/cldnn_engine/ops/strided_slice.cpp index 0bf5adb74e22e4..8c796290a747da 100644 --- a/inference-engine/src/cldnn_engine/ops/strided_slice.cpp +++ b/inference-engine/src/cldnn_engine/ops/strided_slice.cpp @@ -8,9 +8,9 @@ #include "ngraph/op/strided_slice.hpp" #include "ngraph/op/constant.hpp" -#include "api/strided_slice.hpp" -#include "api/reshape.hpp" -#include "api/crop.hpp" +#include "cldnn/primitives/strided_slice.hpp" +#include "cldnn/primitives/reshape.hpp" +#include "cldnn/primitives/crop.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/tensor_iterator.cpp b/inference-engine/src/cldnn_engine/ops/tensor_iterator.cpp index e4e78208c0297e..d9843481979f23 100644 --- a/inference-engine/src/cldnn_engine/ops/tensor_iterator.cpp +++ b/inference-engine/src/cldnn_engine/ops/tensor_iterator.cpp @@ -13,11 +13,11 @@ #include "ngraph/op/constant.hpp" #include "ngraph/op/util/sub_graph_base.hpp" -#include "api/loop.hpp" -#include "api/mutable_data.hpp" -#include "api/data.hpp" -#include "api/reorder.hpp" -#include "api/topology.hpp" +#include "cldnn/primitives/loop.hpp" +#include "cldnn/primitives/mutable_data.hpp" +#include "cldnn/primitives/data.hpp" +#include "cldnn/primitives/reorder.hpp" +#include "cldnn/graph/topology.hpp" #include #include @@ -28,9 +28,8 @@ namespace CLDNNPlugin { template static DATA_TYPE CreateScalarData(Program &p, const cldnn::primitive_id& id, int64_t num) { - auto mem = cldnn::memory::allocate(p.GetEngine(), - { cldnn::data_types::i64, cldnn::format::bfyx, { 1, 1, 1, 1 } }); - auto ptr = mem.pointer(); + auto mem = p.GetEngine().allocate_memory({ cldnn::data_types::i64, cldnn::format::bfyx, { 1, 1, 1, 1 } }); + cldnn::mem_lock ptr{mem, p.GetEngine().get_program_stream()}; *ptr.begin() = num; return {id, mem}; } @@ -42,7 +41,7 @@ static cldnn::mutable_data CreateAdditionalOutputData(Program &p, const std::sha const auto format = DefaultFormatForDims(op->get_output_shape(output_idx).size()); const auto tensor = CldnnTensorFromIEDims(op->get_output_shape(output_idx)); cldnn::layout output_layout = cldnn::layout(precision, format, tensor); - auto mem = cldnn::memory::allocate(p.GetEngine(), output_layout); + auto mem = p.GetEngine().allocate_memory(output_layout); auto md = cldnn::mutable_data(id, {input}, mem); // cldnn::data cannot set dependency return md; } diff --git a/inference-engine/src/cldnn_engine/ops/tile.cpp b/inference-engine/src/cldnn_engine/ops/tile.cpp index a4856ad80dc210..aa91fbd3d5a410 100644 --- a/inference-engine/src/cldnn_engine/ops/tile.cpp +++ b/inference-engine/src/cldnn_engine/ops/tile.cpp @@ -7,7 +7,7 @@ #include "ngraph/op/tile.hpp" -#include "api/tile.hpp" +#include "cldnn/primitives/tile.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/topk.cpp b/inference-engine/src/cldnn_engine/ops/topk.cpp index d527aee1d61b6e..3d8f7e6521a112 100644 --- a/inference-engine/src/cldnn_engine/ops/topk.cpp +++ b/inference-engine/src/cldnn_engine/ops/topk.cpp @@ -7,8 +7,8 @@ #include "ngraph/op/topk.hpp" -#include "api/arg_max_min.hpp" -#include "api/mutable_data.hpp" +#include "cldnn/primitives/arg_max_min.hpp" +#include "cldnn/primitives/mutable_data.hpp" namespace CLDNNPlugin { @@ -71,7 +71,7 @@ void CreateTopKOp(Program& p, const std::shared_ptr& op) { DefaultFormatForDims(op->get_output_shape(1).size()), CldnnTensorFromIEDims(op->get_output_shape(1))); - auto shared_memory = cldnn::memory::allocate(p.GetEngine(), 
mutableLayout); + auto shared_memory = p.GetEngine().allocate_memory(mutableLayout); cldnn::primitive_id argmax_mutable_id_w = layer_type_name_ID(op) + "_md_write"; auto argmax_mutable_prim = cldnn::mutable_data(argmax_mutable_id_w, shared_memory); diff --git a/inference-engine/src/cldnn_engine/ops/transpose.cpp b/inference-engine/src/cldnn_engine/ops/transpose.cpp index a1c8ce63caef6f..f5de62923a2c3b 100644 --- a/inference-engine/src/cldnn_engine/ops/transpose.cpp +++ b/inference-engine/src/cldnn_engine/ops/transpose.cpp @@ -8,7 +8,7 @@ #include "ngraph/op/transpose.hpp" #include "ngraph/op/constant.hpp" -#include "api/permute.hpp" +#include "cldnn/primitives/permute.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/cldnn_engine/ops/unary.cpp b/inference-engine/src/cldnn_engine/ops/unary.cpp index 334d68c5f2379f..9a277a8be2aa04 100644 --- a/inference-engine/src/cldnn_engine/ops/unary.cpp +++ b/inference-engine/src/cldnn_engine/ops/unary.cpp @@ -41,7 +41,7 @@ #include "ngraph/op/hsigmoid.hpp" #include "ngraph/op/round.hpp" -#include "api/activation.hpp" +#include "cldnn/primitives/activation.hpp" namespace CLDNNPlugin { diff --git a/inference-engine/src/gna_plugin/CMakeLists.txt b/inference-engine/src/gna_plugin/CMakeLists.txt index f3ce2858570733..36b9d6d5cc0b8e 100644 --- a/inference-engine/src/gna_plugin/CMakeLists.txt +++ b/inference-engine/src/gna_plugin/CMakeLists.txt @@ -29,12 +29,15 @@ endif() # # Shared plugin library -# +# ie_add_plugin(NAME ${TARGET_NAME} DEVICE_NAME "GNA" SOURCES ${SOURCES} ${HEADERS}) +# Enable support of CC for the plugin +ie_mark_target_as_cc(${TARGET_NAME}) + # saving rpath to GNA shared library be used by CI log_rpath_from_dir(GNA ${libGNA_LIBRARIES_BASE_PATH}) @@ -67,7 +70,8 @@ target_compile_definitions(${TARGET_NAME}_test_static target_link_libraries(${TARGET_NAME}_test_static PUBLIC inference_engine_preproc_s inference_engine_transformations libGNA::API) target_include_directories(${TARGET_NAME}_test_static PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} - $) + $ + PRIVATE $) set_target_properties(${TARGET_NAME}_test_static PROPERTIES COMPILE_PDB_NAME ${TARGET_NAME}_test_static) set_target_properties(${TARGET_NAME} ${TARGET_NAME}_test_static @@ -76,6 +80,6 @@ set_target_properties(${TARGET_NAME} ${TARGET_NAME}_test_static # install file(GLOB_RECURSE source_list "${libGNA_LIBRARIES_BASE_PATH}/*${CMAKE_SHARED_LIBRARY_SUFFIX}*") -install(FILES ${source_list} +install(FILES ${source_list} DESTINATION ${IE_CPACK_IE_DIR}/external/gna/lib COMPONENT gna) diff --git a/inference-engine/src/gna_plugin/backend/gna_limitations.cpp b/inference-engine/src/gna_plugin/backend/gna_limitations.cpp index 8443856a449946..6afe55bd043d93 100644 --- a/inference-engine/src/gna_plugin/backend/gna_limitations.cpp +++ b/inference-engine/src/gna_plugin/backend/gna_limitations.cpp @@ -5,14 +5,15 @@ #include "gna_limitations.hpp" #include +#include +#include +#include +#include +#include -using GNAPluginNS::GNALimitations::Cnn2D::Validator; -using GNAPluginNS::GNALimitations::Cnn2D::VectorOrSquareLimit; -using GNAPluginNS::GNALimitations::Cnn2D::VectorOrSquareLimitByChannels; -using GNAPluginNS::GNALimitations::Cnn2D::VectorOrSquareLimitByChannelsAndPrecision; -using GNAPluginNS::GNALimitations::Cnn2D::RangeLimit; -using GNAPluginNS::GNALimitations::Cnn2D::RangeLimit2D; -using GNAPluginNS::GNALimitations::Cnn2D::RangeMultipleLimit; +namespace GNAPluginNS { +namespace GNALimitations { +namespace Cnn2D { bool RangeLimit::isValid(const uint32_t val) const { return val >= min && val <= 
max; @@ -127,3 +128,65 @@ void Validator::ThrowIfNotEmpty(const std::string prefix, const std::string erro THROW_GNA_EXCEPTION << prefix << error; } } + +} // namespace Cnn2D + +bool AreLayersSupported(InferenceEngine::CNNNetwork& network, std::string& errMessage) { + IE_SUPPRESS_DEPRECATED_START + InferenceEngine::InputsDataMap inputs = network.getInputsInfo(); + std::unordered_set allLayers; + InferenceEngine::CNNLayerPtr startLayer; + if (inputs.empty()) { + auto outputs = network.getOutputsInfo(); + IE_ASSERT(!outputs.empty()); + // If there are no inputs start search from an output + startLayer = getCreatorLayer(outputs.begin()->second).lock(); + } else { + auto network_input_precision = inputs.begin()->second->getPrecision(); + + if (network_input_precision != InferenceEngine::Precision::FP32 && + network_input_precision != InferenceEngine::Precision::I16 && + network_input_precision != InferenceEngine::Precision::U8) { + errMessage = "The plugin does not support input precision with " + + std::string(network_input_precision.name()) + + " format. Supported input precisions FP32, I16, U8\n"; + return false; + } + + auto & secondLayers = getInputTo(inputs.begin()->second->getInputData()); + if (secondLayers.empty()) { + errMessage = "Network consists of input layer only (GNA)\n"; + return false; + } + startLayer = secondLayers.begin()->second; + } + auto batch_size = network.getBatchSize(); + + bool check_result = true; + InferenceEngine::details::UnorderedDFS(allLayers, + startLayer, + [&](const InferenceEngine::CNNLayerPtr layer) { + if (GNAPluginNS::LayerTypeFromStr(layer->type) == GNAPluginNS::LayerType::NO_TYPE) { + errMessage = "The plugin does not support layer: " + layer->name + ":" + layer->type + "\n"; + check_result = false; + } + if (batch_size != 1 && GNAPluginNS::LayerInfo::isBatchSizeConstrained(layer->type)) { + errMessage = "topology with layer: " + layer->name + ", type: " + layer->type + + ", and batch size(" + std::to_string(batch_size) + ") != 1 not supported"; + check_result = false; + } + if (GNAPluginNS::LayerInfo(layer).isFullyConnected()) { + size_t output_batch_size = GNAPluginNS::LayerInfo(layer).getOutputBatchSize(); + if (output_batch_size > 8) { + errMessage = "topology with layer: " + layer->name + ", type: " + layer->type + + ", and batch size(" + std::to_string(output_batch_size) + ") not supported"; + check_result = false; + } + } + }, false); + IE_SUPPRESS_DEPRECATED_END + return check_result; +} + +} // namespace GNALimitations +} // namespace GNAPluginNS diff --git a/inference-engine/src/gna_plugin/backend/gna_limitations.hpp b/inference-engine/src/gna_plugin/backend/gna_limitations.hpp index 9b0eccaea59714..59dd0478cfa900 100644 --- a/inference-engine/src/gna_plugin/backend/gna_limitations.hpp +++ b/inference-engine/src/gna_plugin/backend/gna_limitations.hpp @@ -6,17 +6,23 @@ #include "dnn_types.h" #include +#include namespace GNAPluginNS { namespace GNALimitations { +constexpr uint32_t bufferMaxSize = 65528; + constexpr uint32_t convMinFiltersNum = 4; constexpr uint32_t convMaxFiltersNum = 65532; constexpr uint32_t convFiltersNumDivider = 4; +constexpr uint32_t convFilterMaxSize = 768; constexpr uint32_t convEachKernelByteAlignment = 16; constexpr uint32_t noOfInputsDivisor = 8; constexpr uint32_t noOfInputsLowPrecDivisor = 16; +constexpr uint32_t affineMaxBatchSize = 8; + namespace Cnn2D { struct RangeLimit { uint32_t min; @@ -89,5 +95,8 @@ class Validator { const uint32_t strideH, const uint32_t strideW) const; }; } // namespace Cnn2D + +bool 
AreLayersSupported(InferenceEngine::CNNNetwork& network, std::string& errMessage); + } // namespace GNALimitations } // namespace GNAPluginNS diff --git a/inference-engine/src/gna_plugin/frontend/model_quantizer.hpp b/inference-engine/src/gna_plugin/frontend/model_quantizer.hpp index 1f3f125a029172..b57813858ac69a 100644 --- a/inference-engine/src/gna_plugin/frontend/model_quantizer.hpp +++ b/inference-engine/src/gna_plugin/frontend/model_quantizer.hpp @@ -15,6 +15,7 @@ #include "layer_quantizer.hpp" #include "scale_factor_calc.hpp" #include "weights_converter.hpp" +#include "gna_itt.hpp" namespace GNAPluginNS { @@ -40,6 +41,7 @@ class ModelQuantizer { template InferenceEngine::CNNNetwork quantize(const InferenceEngine::CNNNetwork &model, const PreQuantisationCb &cb, std::vector scaleFactor) const { + OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "ModelQuantizer::quantize"); auto visitor = [&](InferenceEngine::CNNLayerPtr lp) { auto newLayer = InferenceEngine::injectData(lp); transformLayer(newLayer, WeightsConverter()); diff --git a/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp b/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp index 3c1fdaac0e729b..11f13a7a9acad7 100644 --- a/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp +++ b/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp @@ -370,14 +370,8 @@ class ScaleFactorPerLayer { auto minOutValue = quantizedParams->_dst_quant.GetMinValues().front(); auto maxOutValue = quantizedParams->_dst_quant.GetMaxValues().front(); auto absMax = std::max(std::abs(minOutValue), std::abs(maxOutValue)); - auto absMin = std::min(std::abs(minOutValue), std::abs(maxOutValue)); result = (quantizedParams->_dst_quant.GetLevels() - 1) / (maxOutValue - minOutValue); - if (0 && fp32eq(absMin, 0.0f) && !fp32eq(absMax, 0.0f)) { - result = (quantizedParams->_dst_quant.GetLevels() - 1) / (2 * absMax); - } - // - //result = MAX_VAL_2B_FEAT / absMax; if (std::isinf(result) || fp32eq(absMax, 0.0f)) { result = max_activation_scale_factor; } @@ -401,6 +395,7 @@ class ScaleFactorPerLayer { (layer.isIdentity() || layer.isFakeQuantize()) && LayerInfo(prevLayer).isWeightableIdentity()) { auto prevLayerQuant = InferenceEngine::getInjectedData(*prevLayer); if (!fp32eq(prevLayerQuant->_src_quant.GetScale(), 1.0f) && + prevLayerQuant->_src_quant.IsStatsSet() && (prevLayer2 == nullptr || LayerInfo(prevLayer2).has8BOr16BOutput())) { result = prevLayerQuant->_src_quant.GetScale(); usePrevScaleFactor = true; diff --git a/inference-engine/src/gna_plugin/gna_device.cpp b/inference-engine/src/gna_plugin/gna_device.cpp index cbfc47f57aab0b..85a246ea34f134 100644 --- a/inference-engine/src/gna_plugin/gna_device.cpp +++ b/inference-engine/src/gna_plugin/gna_device.cpp @@ -96,14 +96,12 @@ void GNADeviceHelper::setUpActiveList(const uint32_t requestConfigId, uint32_t l const auto status = Gna2RequestConfigEnableActiveList(requestConfigId, layerIndex, num_active_indices, ptr_active_indices); checkGna2Status(status, "Gna2RequestConfigEnableActiveList"); } -void GNADeviceHelper::propagateSync(const uint32_t requestConfigId, Gna2AccelerationMode gna2AccelerationMode) { - wait(propagate(requestConfigId, gna2AccelerationMode)); -} uint32_t GNADeviceHelper::propagate(const uint32_t requestConfigId, Gna2AccelerationMode gna2AccelerationMode) { std::unique_lock lockGnaCalls{ acrossPluginsSync }; uint32_t reqId{}; - if (gna2AccelerationMode == Gna2AccelerationModeHardware && + if ((gna2AccelerationMode == Gna2AccelerationModeHardware || + 
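AreLayersSupported, declared in the gna_limitations header hunk just above, centralizes the plugin's legacy support check: it walks the network with UnorderedDFS and reports the first offending layer type, input precision, or batch size through errMessage while returning false. A hedged sketch of a call site, using forward declarations so the snippet stays self-contained:

    #include <iostream>
    #include <string>

    // Forward declarations only; the real types come from the Inference Engine and
    // from backend/gna_limitations.hpp as modified above.
    namespace InferenceEngine { class CNNNetwork; }
    namespace GNAPluginNS { namespace GNALimitations {
    bool AreLayersSupported(InferenceEngine::CNNNetwork& network, std::string& errMessage);
    } }

    // Hypothetical caller: the helper reports support through its return value and
    // writes the reason collected during the DFS walk into errMessage.
    bool CheckNetworkIsSupported(InferenceEngine::CNNNetwork& network) {
        std::string errMessage;
        if (!GNAPluginNS::GNALimitations::AreLayersSupported(network, errMessage)) {
            std::cerr << "GNA cannot run this network: " << errMessage;
            return false;
        }
        return true;
    }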
gna2AccelerationMode == Gna2AccelerationModeHardwareWithSoftwareFallback) && detectedGnaDevVersion == Gna2DeviceVersionSoftwareEmulation) { gnawarn() << "GNA Device not detected, consider using other mode of acceleration"; } @@ -541,6 +539,8 @@ void GNADeviceHelper::updateGnaPerfCounters() { #if GNA_LIB_VER == 2 instrumentationTotal[0] = instrumentationResults[0]; instrumentationTotal[1] = instrumentationResults[1]; + instrumentationResults[0] = 0; + instrumentationResults[1] = 0; #else nGNAPerfResultsTotal.hw.stall = nGNAPerfResults.hw.stall; nGNAPerfResultsTotal.hw.total = nGNAPerfResults.hw.total; diff --git a/inference-engine/src/gna_plugin/gna_device.hpp b/inference-engine/src/gna_plugin/gna_device.hpp index e032e5532dafc3..cae32c70b1de3e 100644 --- a/inference-engine/src/gna_plugin/gna_device.hpp +++ b/inference-engine/src/gna_plugin/gna_device.hpp @@ -117,18 +117,12 @@ class GNADeviceHelper { uint8_t *alloc(uint32_t size_requested, uint32_t *size_granted); #if GNA_LIB_VER == 1 - void propagateSync(const intel_nnet_type_t *pNeuralNetwork, - const uint32_t *pActiveIndices, - uint32_t nActiveIndices, - intel_gna_proc_t nGNAProcType); - uint32_t propagate(const intel_nnet_type_t *pNeuralNetwork, const uint32_t *pActiveIndices, uint32_t nActiveIndices, intel_gna_proc_t nGNAProcType); #else void setUpActiveList(unsigned req_config_id, uint32_t layerIndex, uint32_t* ptr_active_indices, uint32_t num_active_indices); - void propagateSync(const uint32_t requestConfigId, Gna2AccelerationMode gna2AccelerationMode); uint32_t propagate(const uint32_t requestConfigId, Gna2AccelerationMode gna2AccelerationMode); uint32_t createModel(Gna2Model& gnaModel) const; void releaseModel(const uint32_t model_id); diff --git a/inference-engine/src/gna_plugin/gna_graph_compiler.cpp b/inference-engine/src/gna_plugin/gna_graph_compiler.cpp index 23685b4734faf9..e48595d6c272f3 100644 --- a/inference-engine/src/gna_plugin/gna_graph_compiler.cpp +++ b/inference-engine/src/gna_plugin/gna_graph_compiler.cpp @@ -34,7 +34,6 @@ #include "layers/gna_crop_layer.hpp" #include "layers/gna_fake_quantize_layer.hpp" #include "round_float_define.hpp" -#include "gna_plugin_policy.hpp" #include "gna_groups.hpp" #include "backend/gna_limitations.hpp" @@ -62,10 +61,6 @@ void GNAGraphCompiler::setGNAFlagsPtr(std::shared_ptr gna this->gnaFlags = std::move(gnaFlagsPtr); } -void GNAGraphCompiler::setPolicy(GNAPluginNS::Policy policyToSet) { - this->policy = policyToSet; -} - intel_dnn_component_t * GNAGraphCompiler::find_first_unused_input(InferenceEngine::CNNLayerPtr current) { if (current->insData.empty()) return nullptr; @@ -158,25 +153,27 @@ void GNAGraphCompiler::fillSplitConnections(InferenceEngine::CNNLayerPtr layer) THROW_GNA_LAYER_EXCEPTION(layer) << " outData["<< i << "]" << " connected by " << j <<" connection doesnt connect to functional layer"; } - auto dataOutput = outFunctionalLayer.first->insData[outFunctionalLayer.second].lock(); - - padding = std::max(padding, LayerInfo(outFunctionalLayer.first).paddingSize()) - * dataOutput->getPrecision().size(); - output_layer_size = - InferenceEngine::details::product(begin(dataOutput->getDims()), - end(dataOutput->getDims())) * dataOutput->getPrecision().size(); - - if (LayerInfo(outFunctionalLayer.first).isAffineFilter()) { - size_t aligned64_offset = outFunctionalLayer.first->GetParamAsInt("offset"); - layerInfoItem.splitOutputLayers.emplace_back( - outFunctionalLayer.first, - outFunctionalLayer.second, - aligned64_offset * dataOutput->getPrecision().size(), - 
output_layer_size); - } else { - layerInfoItem.splitOutputLayers.emplace_back( - outFunctionalLayer.first, outFunctionalLayer.second, split_size, output_layer_size); - } + for (int idx : outFunctionalLayer.second) { + auto dataOutput = outFunctionalLayer.first->insData[idx].lock(); + + padding = std::max(padding, LayerInfo(outFunctionalLayer.first).paddingSize()) + * dataOutput->getPrecision().size(); + output_layer_size = + InferenceEngine::details::product(begin(dataOutput->getDims()), + end(dataOutput->getDims())) * dataOutput->getPrecision().size(); + + if (LayerInfo(outFunctionalLayer.first).isAffineFilter()) { + size_t aligned64_offset = outFunctionalLayer.first->GetParamAsInt("offset"); + layerInfoItem.splitOutputLayers.emplace_back( + outFunctionalLayer.first, + idx, + aligned64_offset * dataOutput->getPrecision().size(), + output_layer_size); + } else { + layerInfoItem.splitOutputLayers.emplace_back( + outFunctionalLayer.first, idx, split_size, output_layer_size); + } + } } // in case of unconnected split - we need properly increment size @@ -545,10 +542,7 @@ void GNAGraphCompiler::finalizeConvolution2DPrimitive(InferenceEngine::CNNLayerP auto effectiveInputWidth = in_width; auto effectiveInputHeight = in_height; - if (policy.cnn2dInputPaddingSupported) { - effectiveInputWidth += convolution._padding_x * 2; - effectiveInputHeight += convolution._padding_y * 2; - } else if (convolution._padding_x != 0 || convolution._padding_y != 0 || + if (convolution._padding_x != 0 || convolution._padding_y != 0 || convolution._pads_end.at(X_AXIS) != 0 || convolution._pads_end.at(Y_AXIS) != 0) { THROW_GNA_LAYER_EXCEPTION(layer) << "Convolution's input padding is not supported"; } @@ -1639,7 +1633,7 @@ void GNAGraphCompiler::ConcatAlignFilterPrimitive(InferenceEngine::CNNLayerPtr l uint32_t num_rows_copied = 0; // in case of left alignment succeed, but due to number of elements not multiple of 8 we need to insert align_filter // we are improving it by inserting copy layer of size that covers most of elements - remained max of 32x31 affine filter - if (policy.ConcatAlignmentPolicy == Policy::ConcatAlignment::FAST && 0 == numRowsPadded && ALIGN(num_rows_in, 32) > 32) { + if (0 == numRowsPadded && ALIGN(num_rows_in, 32) > 32) { // can we use copy at all num_rows_copied = ALIGN(num_rows_in, 32) - 32; diff --git a/inference-engine/src/gna_plugin/gna_graph_compiler.hpp b/inference-engine/src/gna_plugin/gna_graph_compiler.hpp index 148c34a82ff5a7..d761d91739269c 100644 --- a/inference-engine/src/gna_plugin/gna_graph_compiler.hpp +++ b/inference-engine/src/gna_plugin/gna_graph_compiler.hpp @@ -27,7 +27,6 @@ #include "backend/gna_limitations.hpp" #include "gna_device.hpp" #include "gna_data_types.hpp" -#include "gna_plugin_policy.hpp" namespace GNAPluginNS { class GNAGraphCompiler { @@ -36,7 +35,6 @@ class GNAGraphCompiler { std::shared_ptr gnamem; std::shared_ptr inputDesc; std::shared_ptr gnaFlags; - Policy policy; // layers with extra storage for connections and additional // non trivial processing @@ -64,7 +62,6 @@ class GNAGraphCompiler { void setDNNPtr(std::shared_ptr dnnPtr); void setInputDescPtr(std::shared_ptr inputDescPtr); void setGNAFlagsPtr(std::shared_ptr gnaFlagsPtr); - void setPolicy(GNAPluginNS::Policy policy); void fillMemoryConnections(std::unordered_map> &memoryPairs); diff --git a/inference-engine/src/gna_plugin/gna_graph_tools.hpp b/inference-engine/src/gna_plugin/gna_graph_tools.hpp index e9cf70790acdb9..51701268209728 100644 --- a/inference-engine/src/gna_plugin/gna_graph_tools.hpp 
+++ b/inference-engine/src/gna_plugin/gna_graph_tools.hpp @@ -155,14 +155,14 @@ inline InferenceEngine::CNNLayerPtr CNNNetPrevLayerSkipCertain(Layer layer, int */ template -inline std::pair CNNNetCheckNextLayerSkipCertain(Layer layer, int oidx, int iidx, bool bOnlyCheck, +inline std::pair> CNNNetCheckNextLayerSkipCertain(Layer layer, int oidx, int iidx, bool bOnlyCheck, const std::function &shouldSkip) { if (oidx >= layer->outData.size()) { - if (bOnlyCheck) return {nullptr, 0}; + if (bOnlyCheck) return {nullptr, {}}; THROW_GNA_LAYER_EXCEPTION(layer) << " no next output layer for outdata: " << oidx; } if (getInputTo(layer->outData[oidx]).empty() || iidx >= getInputTo(layer->outData[oidx]).size()) { - if (bOnlyCheck) return {nullptr, 0}; + if (bOnlyCheck) return {nullptr, {}}; THROW_GNA_LAYER_EXCEPTION(layer) << " no next output layer for outdata: " << oidx << " and inputTo index: " << iidx; } @@ -174,12 +174,12 @@ inline std::pair CNNNetCheckNextLayerSkipCer while (shouldSkip(outLayer->second)) { if (outLayer->second->outData.size() <= new_oidx) { - if (bOnlyCheck) return { nullptr, 0 }; + if (bOnlyCheck) return { nullptr, {} }; THROW_GNA_LAYER_EXCEPTION(outLayer->second) << " no next output layer for outdata: " << new_oidx; } if (getInputTo(outLayer->second->outData[new_oidx]).size() <= new_iidx) { - if (bOnlyCheck) return { nullptr, 0 }; + if (bOnlyCheck) return { nullptr, {} }; THROW_GNA_LAYER_EXCEPTION(outLayer->second) << " no next output layer for outdata: " << new_oidx << " and inputTo index: " << new_iidx; } @@ -188,11 +188,7 @@ inline std::pair CNNNetCheckNextLayerSkipCer } auto insDataIdx = CNNLayerFindInsDataIdxes(layer->outData[new_oidx], outLayer->second); - if (insDataIdx.size() != 1) { - if (bOnlyCheck) return { nullptr, 0 }; - THROW_GNA_LAYER_EXCEPTION(layer) << " has multiple connection to " << new_oidx << " outData"; - } - return { outLayer->second, insDataIdx.front() }; + return { outLayer->second, insDataIdx }; } /** @@ -256,7 +252,7 @@ inline std::pair CNNNetCheckNextLayerSkipCer /// @brief alias for strict checkNextLayer (false) template -inline std::pair CNNNetGetNextLayerSkipCertain(Layer layer, int oidx, int iidx, +inline std::pair> CNNNetGetNextLayerSkipCertain(Layer layer, int oidx, int iidx, const std::function &shouldSkip) { return CNNNetCheckNextLayerSkipCertain(layer, oidx, iidx, false, shouldSkip); } diff --git a/inference-engine/src/gna_plugin/gna_groups.hpp b/inference-engine/src/gna_plugin/gna_groups.hpp index 21abe5d01246f4..704588a153d039 100644 --- a/inference-engine/src/gna_plugin/gna_groups.hpp +++ b/inference-engine/src/gna_plugin/gna_groups.hpp @@ -46,22 +46,10 @@ inline InferenceEngine::DataPtr Get2DReshapedData(InferenceEngine::DataPtr input * @param layer */ inline bool HasTo2DReshapeData(InferenceEngine::CNNLayerPtr layer) { - if (GNAPluginNS::LayerInfo(layer).isPower()) + if (GNAPluginNS::LayerInfo(layer).isPower() || GNAPluginNS::LayerInfo(layer).isCopy()) return true; - if (!GNAPluginNS::LayerInfo(layer).isScaleShift()) - return false; - - // Don't reshape user-defined ScaleShift layers - if (layer->name.rfind("SyntheticScaleShift", 0) == std::string::npos) - return false; - - // Don't reshape the first dnn layer since it breaks groups recognition - auto prevLayer = InferenceEngine::CNNNetPrevLayerSkipCertain(layer, 0, [](InferenceEngine::CNNLayerPtr ptr) { - return LayerInfo(ptr).isNonValuesChangable(); - }); - IE_ASSERT(prevLayer != nullptr); - if (LayerInfo(prevLayer).isInput()) + if 
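The gna_graph_tools.hpp hunks above change CNNNetCheckNextLayerSkipCertain and CNNNetGetNextLayerSkipCertain to return every insData index through which the next functional layer is reached, instead of throwing when there is more than one. The angle-bracket contents of the return type are missing from this copy of the diff; the sketch below assumes std::pair<InferenceEngine::CNNLayerPtr, std::vector<int>>, which is consistent with the {nullptr, {}} early returns and with the way fillSplitConnections now iterates over .second.

    #include <utility>
    #include <vector>

    // Assumed shape of the new return type (the template arguments were lost in this
    // copy of the diff); in the plugin, Layer is InferenceEngine::CNNLayerPtr.
    template <typename Layer>
    using NextLayerResult = std::pair<Layer, std::vector<int>>;

    // Caller-side pattern introduced by this patch: handle each connected input index,
    // the way fillSplitConnections loops over outFunctionalLayer.second above.
    template <typename Layer, typename Fn>
    void ForEachNextConnection(const NextLayerResult<Layer>& next, Fn&& handle) {
        if (!next.first)                 // the bOnlyCheck case returns {nullptr, {}}
            return;
        for (int idx : next.second)      // one entry per connected insData index
            handle(next.first, idx);
    }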
(!GNAPluginNS::LayerInfo(layer).isSyntheticScaleShift()) return false; // Don't reshape diagonallayers with bias connection diff --git a/inference-engine/src/gna_plugin/gna_itt.hpp b/inference-engine/src/gna_plugin/gna_itt.hpp new file mode 100644 index 00000000000000..3fa0211973329e --- /dev/null +++ b/inference-engine/src/gna_plugin/gna_itt.hpp @@ -0,0 +1,21 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +/** + * @brief Defines openvino domains for tracing + * @file gna_itt.hpp + */ + +#pragma once + +#include + +namespace GNAPluginNS { +namespace itt { +namespace domains { + OV_ITT_DOMAIN(GNAPlugin); + OV_ITT_DOMAIN(GNA_LT); +} +} +} diff --git a/inference-engine/src/gna_plugin/gna_model_serial.cpp b/inference-engine/src/gna_plugin/gna_model_serial.cpp index fdb99d7f273bc1..e32ded8a9e37ed 100644 --- a/inference-engine/src/gna_plugin/gna_model_serial.cpp +++ b/inference-engine/src/gna_plugin/gna_model_serial.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #endif @@ -133,10 +134,11 @@ GNAPluginNS::HeaderLatest::ModelHeader GNAModelSerial::ReadHeader(std::istream & } case 5: case 6: + case 7: readNBytes(&header, sizeof(HeaderLatest::ModelHeader), is); break; default: - THROW_GNA_EXCEPTION << "Imported file unsupported. minor version should have values in range 1 to 4 and is: " << header.version.minor; + THROW_GNA_EXCEPTION << "Imported file unsupported. minor version should have values in range 1 to 7 and is: " << header.version.minor; } break; default: @@ -154,6 +156,40 @@ GNAPluginNS::HeaderLatest::ModelHeader GNAModelSerial::ReadHeader(std::istream & return header; } +GNAPluginNS::HeaderLatest::RuntimeEndPoint GNAModelSerial::ReadEndPoint(std::istream &is) { + is.exceptions(std::istream::failbit); + + HeaderLatest::RuntimeEndPoint endPoint; + switch (modelHeader.version.major) { + case 2: + switch (modelHeader.version.minor) { + case 1: + case 2: + case 3: + case 4: + case 5: + case 6: + { + Header2dot6::RuntimeEndPoint tempEndPoint2dot6; + readBits(tempEndPoint2dot6, is); + endPoint = HeaderLatest::RuntimeEndPoint(tempEndPoint2dot6, modelHeader.nGroup); + break; + } + case 7: + readNBytes(&endPoint, sizeof(HeaderLatest::RuntimeEndPoint), is); + break; + default: + THROW_GNA_EXCEPTION << "Imported file unsupported. minor version should have values in range 1 to 7 and is: " << modelHeader.version.minor; + } + break; + default: + THROW_GNA_EXCEPTION << "Imported file unsupported. 
Import for files with major version equal to: " + << modelHeader.version.major << " is not implemented"; + } + + return endPoint; +} + #define offsetFromBase(field)\ getOffsetFromBase(field, #field) @@ -324,18 +360,6 @@ void GNAModelSerial::Import(void *basePointer, is.read(reinterpret_cast(basePointer), gnaGraphSize); } - -uint32_t guessGrouping(Gna2Model const& model) { - if (model.NumberOfOperations == 0 || - model.Operations == nullptr || - model.Operations[0].Operands == nullptr || - model.Operations[0].NumberOfOperands == 0 || - model.Operations[0].Operands[0]->Shape.NumberOfDimensions < 2) { - THROW_GNA_EXCEPTION << "Can not guess grouping"; - } - return (std::min)(model.Operations[0].Operands[0]->Shape.Dimensions[0], model.Operations[0].Operands[0]->Shape.Dimensions[1]); -} - void GNAModelSerial::Export(void * basePointer, size_t gnaGraphSize, std::ostream & os) const { os.exceptions(std::ostream::failbit); @@ -366,6 +390,9 @@ void GNAModelSerial::Export(void * basePointer, size_t gnaGraphSize, std::ostrea out.descriptor_offset = offsetFromBase(ep.descriptor_ptr); out.scaleFactor = ep.scaleFactor; out.element_size = ep.element_size; + out.shape = ep.shape; + out.layout = ep.layout; + out.precision = ep.precision; out.orientation = ep.orientation; return out; }; @@ -381,7 +408,7 @@ void GNAModelSerial::Export(void * basePointer, size_t gnaGraphSize, std::ostrea header.headerSize = sizeof(HeaderLatest::ModelHeader); header.gnaMemSize = gnaGraphSize; header.layersCount = layers.size(); - header.nGroup = guessGrouping(*gna2Model); + header.nGroup = 1; // just to support the old models header.nInputs = inputs.size(); header.nOutputs = outputs.size(); header.nTransposeInputs = transposeInputsInfo.size(); @@ -796,13 +823,22 @@ std::vector GNAModelSerial::serializeOutputs(cons std::size_t outputIndex = 0; for (auto const &output : outputsDataMap) { auto outputName = output.first; - auto inputDims = output.second->getTensorDesc().getDims(); - uint32_t elementsCount = static_cast(InferenceEngine::details::product(inputDims.begin(), inputDims.end())); - + auto outputDims = output.second->getTensorDesc().getDims(); + HeaderLatest::RuntimeEndPoint::Shape outputShape; + outputShape.NumberOfDimensions = outputDims.size(); + for (size_t i=0; i < outputShape.NumberOfDimensions; ++i) { + outputShape.Dimensions[i] = static_cast(outputDims[i]); + } + uint32_t elementsCount = static_cast(InferenceEngine::details::product(outputDims.begin(), outputDims.end())); + InferenceEngine::Layout outputLayout = output.second->getLayout(); + InferenceEngine::Precision::ePrecision outputPrecision = InferenceEngine::Precision::FP32; HeaderLatest::RuntimeEndPoint endPoint(outputsDesc[outputIndex].scale_factor, outputsDesc[outputIndex].ptrs[0], outputsDesc[outputIndex].num_bytes_per_element, elementsCount, + outputShape, + outputLayout, + outputPrecision, outputsDesc[outputIndex].orientation); endPoints.push_back(endPoint); outputIndex++; @@ -818,18 +854,26 @@ std::vector GNAModelSerial::serializeInputs(const for (auto const& input : inputsDataMap) { auto inputName = input.first; auto inputDims = input.second->getTensorDesc().getDims(); - + HeaderLatest::RuntimeEndPoint::Shape inputShape; + inputShape.NumberOfDimensions = inputDims.size(); + for (size_t i=0; i < inputShape.NumberOfDimensions; ++i) { + inputShape.Dimensions[i] = static_cast(inputDims[i]); + } double scaleFactor = inputDesc->getScaleFactor(inputIndex); std::vector descriptor_ptr = inputDesc->getPtrInputsGlobal(inputName); IE_ASSERT(descriptor_ptr.size() 
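ReadEndPoint, added above, follows the same versioning scheme as ReadHeader: endpoints written by 2.1 through 2.6 models are read into the old fixed layout and converted, while 2.7 files are read directly into the latest structure. The sketch below is schematic only; the stand-in structs and the FromV6 converter are placeholders for Header2dot6::RuntimeEndPoint, HeaderLatest::RuntimeEndPoint and the converting constructor the hunk actually calls.

    #include <cstdint>
    #include <istream>
    #include <stdexcept>

    // Stand-ins for the versioned endpoint layouts; the field sets are illustrative,
    // not the real serialized structures.
    struct EndPointV6 { float scaleFactor; uint32_t elements_count; };
    struct EndPointV7 { float scaleFactor; uint32_t elements_count; uint32_t shape_rank; };

    // Conversion from the legacy layout, defined elsewhere (the hunk uses a
    // HeaderLatest::RuntimeEndPoint constructor taking the old struct and nGroup).
    EndPointV7 FromV6(const EndPointV6& old, uint32_t nGroup);

    EndPointV7 ReadEndPointSketch(std::istream& is, uint32_t minor, uint32_t nGroup) {
        EndPointV7 ep{};
        if (minor >= 1 && minor <= 6) {         // legacy layout: read old struct, convert
            EndPointV6 old{};
            is.read(reinterpret_cast<char*>(&old), sizeof(old));
            ep = FromV6(old, nGroup);
        } else if (minor == 7) {                // current layout: read in place
            is.read(reinterpret_cast<char*>(&ep), sizeof(ep));
        } else {
            throw std::runtime_error("unsupported model header minor version");
        }
        return ep;
    }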
> 0); uint32_t element_size = 2u; uint32_t elementsCount = static_cast(InferenceEngine::details::product(inputDims.begin(), inputDims.end())); intel_dnn_orientation_t orientation = inputDesc->getOrientation(inputName); - + InferenceEngine::Layout inputLayout = input.second->getLayout(); + InferenceEngine::Precision::ePrecision inputPrecision = InferenceEngine::Precision::FP32; HeaderLatest::RuntimeEndPoint endPoint(scaleFactor, descriptor_ptr[0], element_size, elementsCount, + inputShape, + inputLayout, + inputPrecision, orientation); endPoints.push_back(endPoint); inputIndex++; @@ -846,20 +890,24 @@ void GNAModelSerial::ImportInputs(std::istream &is, for (uint32_t inputIndex = 0; inputIndex < modelHeader.nInputs; inputIndex++) { const std::string& name = (modelHeader.version.major == 2 && modelHeader.version.minor >= 3) ? inputNames.at(inputIndex) : std::string("input" + std::to_string(inputIndex)); - HeaderLatest::RuntimeEndPoint input; - is.read(reinterpret_cast(&input), sizeof(input)); + + HeaderLatest::RuntimeEndPoint input = ReadEndPoint(is); inputsDesc->getPtrInputsGlobal(name).push_back(reinterpret_cast(reinterpret_cast (basePtr) + input.descriptor_offset)); inputsDesc->orientation_in[name] = input.orientation; inputsDesc->bytes_allocated_for_input[name] = input.element_size * input.elements_count; - auto inputDims = InferenceEngine::SizeVector({modelHeader.nGroup, input.elements_count / modelHeader.nGroup}); - + auto inputDims = InferenceEngine::SizeVector(); + for (auto i = 0; i < input.shape.NumberOfDimensions; ++i) { + inputDims.push_back(input.shape.Dimensions[i]); + } + InferenceEngine::Layout inputLayout = static_cast(input.layout); + InferenceEngine::Precision inputPresicion = InferenceEngine::Precision(static_cast(input.precision)); dataMap[name] = std::make_shared(); dataMap[name]->setInputData(std::make_shared(name, InferenceEngine::TensorDesc( - InferenceEngine::Precision::FP32, + inputPresicion, inputDims, - InferenceEngine::Layout::NC))); + inputLayout))); inputsDesc->inputScaleFactors.push_back(input.scaleFactor); } } @@ -875,8 +923,8 @@ void GNAModelSerial::ImportOutputs(std::istream &is, for (uint32_t outputIndex = 0; outputIndex < modelHeader.nOutputs; outputIndex++) { const std::string& name = (modelHeader.version.major == 2 && modelHeader.version.minor >= 3) ? 
outputNames.at(outputIndex) : std::string("output" + std::to_string(outputIndex)); - HeaderLatest::RuntimeEndPoint output; - is.read(reinterpret_cast(&output), sizeof(output)); + + HeaderLatest::RuntimeEndPoint output = ReadEndPoint(is); OutputDesc description; description.ptrs.push_back(reinterpret_cast(reinterpret_cast (basePtr) + output.descriptor_offset)); description.orientation = kDnnInterleavedOrientation; @@ -884,12 +932,17 @@ void GNAModelSerial::ImportOutputs(std::istream &is, description.num_bytes_per_element = output.element_size; description.scale_factor = output.scaleFactor; - auto outputDims = InferenceEngine::SizeVector({modelHeader.nGroup, output.elements_count / modelHeader.nGroup}); + auto outputDims = InferenceEngine::SizeVector(); + for (auto i = 0; i < output.shape.NumberOfDimensions; ++i) { + outputDims.push_back(output.shape.Dimensions[i]); + } + InferenceEngine::Layout outputLayout = static_cast(output.layout); + InferenceEngine::Precision outputPresicion = InferenceEngine::Precision(static_cast(output.precision)); dataMap[name] = std::make_shared(name, InferenceEngine::TensorDesc( - InferenceEngine::Precision::FP32, + outputPresicion, outputDims, - InferenceEngine::Layout::NC)); + outputLayout)); desc.at(outputIndex) = description; } } diff --git a/inference-engine/src/gna_plugin/gna_model_serial.hpp b/inference-engine/src/gna_plugin/gna_model_serial.hpp index d756a23f9fc0ab..f5310d826c4c5f 100644 --- a/inference-engine/src/gna_plugin/gna_model_serial.hpp +++ b/inference-engine/src/gna_plugin/gna_model_serial.hpp @@ -138,6 +138,8 @@ class GNAModelSerial { */ static GNAPluginNS::HeaderLatest::ModelHeader ReadHeader(std::istream &is); + GNAPluginNS::HeaderLatest::RuntimeEndPoint ReadEndPoint(std::istream &is); + /** * @brief Import model from FS into preallocated buffer, * buffers for pLayers, and pStructs are allocated here and required manual deallocation using mm_free diff --git a/inference-engine/src/gna_plugin/gna_plugin.cpp b/inference-engine/src/gna_plugin/gna_plugin.cpp index e76eafa6d5372a..d6944f0c621fd4 100644 --- a/inference-engine/src/gna_plugin/gna_plugin.cpp +++ b/inference-engine/src/gna_plugin/gna_plugin.cpp @@ -37,7 +37,7 @@ #include #include "gna_graph_patterns.hpp" #include "gna_tensor_tools.hpp" -#include +#include "gna_itt.hpp" #include #include @@ -54,12 +54,17 @@ #include #include #include +#include #include "transformations/remove_extra_reshapes.hpp" #include "transformations/insert_transpose_after_convolution_or_pooling.hpp" #include "transformations/insert_transpose_before_matmul.hpp" #include "transformations/reorder_activation_and_pooling.hpp" #include "transformations/swap_input_matmul_gna.hpp" +#include "transformations/convert_matmul_to_pointwise_convolution.hpp" +#include "transformations/split_convolution_with_large_buffer_size.hpp" + +#include #if GNA_LIB_VER == 2 #include @@ -386,6 +391,7 @@ GNAPlugin::GNAPlugin(const std::map& configMap) { } void GNAPlugin::Init() { + OV_ITT_SCOPED_TASK(itt::domains::GNAPlugin, "Init"); dnn = std::make_shared(backend::AMIntelDNN()); inputsDesc = std::make_shared(GNAPluginNS::InputDesc()); gnaFlags = std::make_shared(GNAPluginNS::GNAFlags()); @@ -396,6 +402,7 @@ void GNAPlugin::Init() { } void GNAPlugin::InitGNADevice() { + OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "InitGNADevice"); #if GNA_LIB_VER == 1 gnadevice = std::make_shared(gnaFlags->gna_lib_async_threads_num, gnaFlags->gna_openmp_multithreading, @@ -414,6 +421,7 @@ void GNAPlugin::InitGNADevice() { } void 
GNAPlugin::UpdateGnaQuantModeFromNetwork(InferenceEngine::CNNNetwork & network) { + OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "UpdateGnaQuantModeFromNetwork"); // fp32 emulation mode dont need any modifications to configuration if (config.gnaFlags.sw_fp32) return; @@ -449,6 +457,7 @@ void GNAPlugin::UpdateGnaQuantModeFromNetwork(InferenceEngine::CNNNetwork & netw } void GNAPlugin::UpdateInputScaleFromNetwork(InferenceEngine::CNNNetwork & network) { + OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "UpdateInputScaleFromNetwork"); // fp32 emulation mode dont need any modifications to configuration if (config.gnaFlags.sw_fp32) return; @@ -556,6 +565,7 @@ bool GNAPlugin::TryToInitOutput(int portId, InferenceEngine::CNNLayerPtr layer) } void GNAPlugin::FillInputsAndOutputsTranspositionInfo(const InferenceEngine::CNNNetwork& net) { + OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "FillInputsAndOutputsTranspositionInfo"); auto printTranspositionInfo = [](const std::vector &transpositionInfo) { for (const auto &transpositionInfoPart : transpositionInfo) { gnalog() << "transpose=" << transpositionInfoPart.transpose << " rows_num=" << transpositionInfoPart.num_transpose_rows @@ -658,6 +668,7 @@ void GNAPlugin::AddDebugProperties(const InferenceEngine::CNNLayerPtr layer, #endif void GNAPlugin::LoadNetwork(CNNNetwork & _network) { + OV_ITT_SCOPED_TASK(itt::domains::GNAPlugin, "LoadNetwork"); std::shared_ptr convertedNetwork; if (_network.getFunction()) { CNNNetwork clonedNetwork = InferenceEngine::cloneNetwork(_network); @@ -667,6 +678,15 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) { // WA: ConvertPriorBox must be executed before the 1st ConstantFolding pass manager.register_pass(); manager.register_pass(); + // TODO enable this transformation for networks with convolutions + if (!ngraph::op::util::has_op_with_type(graph)) { + manager.register_pass(); + manager.register_pass(); + manager.register_pass(); + } + manager.register_pass(); + manager.register_pass(); + manager.register_pass(); manager.register_pass(); manager.register_pass(); manager.register_pass(); @@ -703,7 +723,7 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) { // Check the input network std::string error; - if (!AreLayersSupported(network, error)) { + if (!GNAPluginNS::GNALimitations::AreLayersSupported(network, error)) { THROW_GNA_EXCEPTION << error.c_str(); } @@ -718,7 +738,7 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) { // network optimisation phases int passIdx = 0; auto run_passes = [&] (const CNNNetwork& network, bool runBeforeCopy, bool lowPrecision) { - auto passes = make_shared(PassManagerSettings{policy, runBeforeCopy, lowPrecision}, network); + auto passes = make_shared(PassManagerSettings{runBeforeCopy, lowPrecision}, network); passes->registerPass(); passes->registerPass(); passes->registerPass(); @@ -735,6 +755,7 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) { passes->registerPass(); passes->registerPass(); + passes->registerPass(); passes->registerPass(); passes->registerPass(); passes->registerPass(); @@ -744,16 +765,9 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) { passes->registerPass(); passes->registerPass(); passes->registerPass(); - if (policy.PermutePolicy != Policy::Permute::DISABLED) { - passes->registerPass(); - } - if (policy.NHWCToNCHWPolicy != Policy::NHWCToNCHW::DISABLED) { - passes->registerPass(); - } - + passes->registerPass(); passes->registerPass(); passes->registerPass(); - passes->registerPass(); passes->registerPass(); passes->registerPass(); #if GNA_LIB_VER == 2 @@ 
-820,9 +834,6 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) { auto sortedNet = CNNNetSortTopologicallyEx(newNet, make_fuzed_order); - // passing policy to compiler - graphCompiler.setPolicy(policy); - if (sortedNet.empty()) { THROW_GNA_EXCEPTION << "Sorted network is empty"; } @@ -1465,7 +1476,11 @@ static InferenceEngine::Layout GetLayoutForDims(const InferenceEngine::SizeVecto Blob::Ptr GNAPlugin::GetOutputBlob(const std::string& name, InferenceEngine::Precision precision) { // need to have intermediate blob for interleave conversion InferenceEngine::Blob::Ptr outputBlob; - auto outputDims = outputsDataMap[name]->getTensorDesc().getDims(); + auto outputDataIt = outputsDataMap.find(name); + if (outputDataIt == std::end(outputsDataMap)) { + THROW_GNA_EXCEPTION << "Output " << name << " isn't found"; + } + auto outputDims = outputDataIt->second->getTensorDesc().getDims(); outputBlob = make_blob_with_precision(TensorDesc(precision, outputDims, GetLayoutForDims(outputDims))); outputBlob->allocate(); return outputBlob; @@ -1475,7 +1490,11 @@ Blob::Ptr GNAPlugin::GetInputBlob(const std::string& name, InferenceEngine::Prec InferenceEngine::Blob::Ptr inputBlob; // need to have intermediate blob for interleave conversion // TODO: NCHW format support is experimental = c++ MO did insert reshape, while TF mo - not - auto inputDims = inputsDataMap[name]->getTensorDesc().getDims(); + auto inputDataIt = inputsDataMap.find(name); + if (inputDataIt == std::end(inputsDataMap)) { + THROW_GNA_EXCEPTION << "Input " << name << " isn't found"; + } + auto inputDims = inputDataIt->second->getTensorDesc().getDims(); inputBlob = make_blob_with_precision(TensorDesc(precision, inputDims, GetLayoutForDims(inputDims))); inputBlob->allocate(); return inputBlob; diff --git a/inference-engine/src/gna_plugin/gna_plugin.hpp b/inference-engine/src/gna_plugin/gna_plugin.hpp index 1b37439eead68e..2ce54f4f9ab292 100644 --- a/inference-engine/src/gna_plugin/gna_plugin.hpp +++ b/inference-engine/src/gna_plugin/gna_plugin.hpp @@ -21,7 +21,6 @@ #include "backend/am_intel_dnn.hpp" #include "gna_data_types.hpp" #include "gna_graph_compiler.hpp" -#include "gna_plugin_policy.hpp" #include "gna_plugin_log.hpp" #include "gna_plugin_config.hpp" #include @@ -69,8 +68,6 @@ class GNAPlugin : public InferenceEngine::IInferencePlugin { intel_dnn_number_type_t output_type = kDnnInt; - GNAPluginNS::Policy policy; - #if GNA_LIB_VER == 2 void createRequestConfigsForGnaModels(); #endif @@ -158,11 +155,6 @@ class GNAPlugin : public InferenceEngine::IInferencePlugin { INFERENCE_ENGINE_DEPRECATED("Use InferRequest::QueryState instead") std::vector QueryState(); - /** - * test-wise API - */ - void SetPolicy(GNAPluginNS::Policy p) {policy = p;} - /** * QueryMetrics API */ diff --git a/inference-engine/src/gna_plugin/gna_plugin_config.cpp b/inference-engine/src/gna_plugin/gna_plugin_config.cpp index 2dcb05d6ab8a3f..766e7d2d52c609 100644 --- a/inference-engine/src/gna_plugin/gna_plugin_config.cpp +++ b/inference-engine/src/gna_plugin/gna_plugin_config.cpp @@ -23,6 +23,7 @@ static const caseless_unordered_map supported_values = { {GNAConfigParams::GNA_SW_EXACT, GNA_SOFTWARE & GNA_HARDWARE} }; static const std::vector supported_values_on_gna2 = { + GNAConfigParams::GNA_HW_WITH_SW_FBACK, GNAConfigParams::GNA_GEN, GNAConfigParams::GNA_GEN_EXACT, GNAConfigParams::GNA_SSE, @@ -34,18 +35,19 @@ static const std::vector supported_values_on_gna2 = { }; #else static const caseless_unordered_map > supported_values = { - {GNAConfigParams::GNA_AUTO, 
{Gna2AccelerationModeAuto, false}}, - {GNAConfigParams::GNA_HW, {Gna2AccelerationModeHardware, false}}, - {GNAConfigParams::GNA_SW, {Gna2AccelerationModeSoftware, false}}, - {GNAConfigParams::GNA_SW_EXACT, {Gna2AccelerationModeSoftware, true}}, - {GNAConfigParams::GNA_GEN, {Gna2AccelerationModeGeneric, false}}, - {GNAConfigParams::GNA_GEN_EXACT, {Gna2AccelerationModeGeneric, true}}, - {GNAConfigParams::GNA_SSE, {Gna2AccelerationModeSse4x2, false}}, - {GNAConfigParams::GNA_SSE_EXACT, {Gna2AccelerationModeSse4x2, true}}, - {GNAConfigParams::GNA_AVX1, {Gna2AccelerationModeAvx1, false}}, - {GNAConfigParams::GNA_AVX1_EXACT, {Gna2AccelerationModeAvx1, true}}, - {GNAConfigParams::GNA_AVX2, {Gna2AccelerationModeAvx2, false}}, - {GNAConfigParams::GNA_AVX2_EXACT, {Gna2AccelerationModeAvx2, true}}, + {GNAConfigParams::GNA_AUTO, {Gna2AccelerationModeAuto, false}}, + {GNAConfigParams::GNA_HW, {Gna2AccelerationModeHardware, false}}, + {GNAConfigParams::GNA_HW_WITH_SW_FBACK, {Gna2AccelerationModeHardwareWithSoftwareFallback, false}}, + {GNAConfigParams::GNA_SW, {Gna2AccelerationModeSoftware, false}}, + {GNAConfigParams::GNA_SW_EXACT, {Gna2AccelerationModeSoftware, true}}, + {GNAConfigParams::GNA_GEN, {Gna2AccelerationModeGeneric, false}}, + {GNAConfigParams::GNA_GEN_EXACT, {Gna2AccelerationModeGeneric, true}}, + {GNAConfigParams::GNA_SSE, {Gna2AccelerationModeSse4x2, false}}, + {GNAConfigParams::GNA_SSE_EXACT, {Gna2AccelerationModeSse4x2, true}}, + {GNAConfigParams::GNA_AVX1, {Gna2AccelerationModeAvx1, false}}, + {GNAConfigParams::GNA_AVX1_EXACT, {Gna2AccelerationModeAvx1, true}}, + {GNAConfigParams::GNA_AVX2, {Gna2AccelerationModeAvx2, false}}, + {GNAConfigParams::GNA_AVX2_EXACT, {Gna2AccelerationModeAvx2, true}}, }; #endif diff --git a/inference-engine/src/gna_plugin/gna_plugin_policy.hpp b/inference-engine/src/gna_plugin/gna_plugin_policy.hpp deleted file mode 100644 index 0611ca0993c73c..00000000000000 --- a/inference-engine/src/gna_plugin/gna_plugin_policy.hpp +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include -namespace GNAPluginNS { -/** - * @brief policy agregates various settings that cannot be tweak using configuration options right now, - * and essential to keep test coverage for options both in on and off cases - */ -class Policy { - public: - /** - * @brief for scaleshift substitution, weight tiling simplify final graph but have extra weights overhead - * if not defined scaleshift broadcast will result in creating multiple diagonal layers instead of weight tiling - */ - enum class ScaleShift { - WEIGHTS_TILING, - /** - * GNA has limited amount of batch so even existed topologies cannot be substituted with only batching, - * this option combines batch and weights tiling - */ - BATCH_AND_WEIGHTS_TILING, - DIAGLAYER_TILING - } ScaleShiftPolicy = ScaleShift::WEIGHTS_TILING; - - /** - * Policy on whether to substitute permute layers or not - */ - enum class Permute { - DISABLED, - AUTO_PERMUTE - } PermutePolicy = Permute::DISABLED; - - enum class FlattenTrivialConcatConversion { - DISABLED, - ENABLED - } ConcatConversionPolicy = FlattenTrivialConcatConversion::ENABLED; - - enum class ConcatAlignment { - DISABLED, - DISABLED_FOR_FP32, - ENABLED, - FAST - } ConcatAlignmentPolicy = ConcatAlignment::FAST; - - /** - * Policy to support --disable_nhwc_to_nchw option in MO - */ - enum class NHWCToNCHW { - DISABLED, - REMOVE_LAST, - REMOVE_ALL - } NHWCToNCHWPolicy = NHWCToNCHW::REMOVE_ALL; - - /** - * @brief 
trim of gna diagonal affine layer maximum elements number - */ - class GNAAffineDiagonal { - public: - enum : uint32_t { - UNLIMIT, - // gna limit this to be OxFFFF - LIMITED_TO_DEFAULT_GNA2_65536 = 65536 - 64 - }; - uint32_t limitedTo = LIMITED_TO_DEFAULT_GNA2_65536; - } GNAAffineDiagonalPolicy; - - bool cnn2dInputPaddingSupported = false; -}; - -inline std::ostream& operator<<(std::ostream& os, Policy::ScaleShift policy) { - switch (policy) { - case Policy::ScaleShift::WEIGHTS_TILING : os << "WEIGHTS_TILING"; break; - case Policy::ScaleShift::BATCH_AND_WEIGHTS_TILING: os << "BATCH_AND_WEIGHTS_TILING"; break; - case Policy::ScaleShift::DIAGLAYER_TILING : os << "DIAGLAYER_TILING"; break; - default : os.setstate(std::ios_base::failbit); - } - return os; -} - -inline std::ostream& operator<<(std::ostream& os, Policy::ConcatAlignment policy) { - switch (policy) { - case Policy::ConcatAlignment::DISABLED : os << "DISABLED"; break; - case Policy::ConcatAlignment::DISABLED_FOR_FP32 : os << "DISABLED_FOR_FP32"; break; - case Policy::ConcatAlignment::ENABLED : os << "ENABLED"; break; - case Policy::ConcatAlignment::FAST : os << "FAST"; break; - default : os.setstate(std::ios_base::failbit); - } - return os; -} - - -} // namespace GNAPluginNS diff --git a/inference-engine/src/gna_plugin/layers/gna_layer_info.hpp b/inference-engine/src/gna_plugin/layers/gna_layer_info.hpp index a4681bdff41e01..93fb4417dc7296 100644 --- a/inference-engine/src/gna_plugin/layers/gna_layer_info.hpp +++ b/inference-engine/src/gna_plugin/layers/gna_layer_info.hpp @@ -90,6 +90,32 @@ class LayerInfo { static InferenceEngine::details::caseless_set layersWithConstrains = {"memory", "convolution"}; return layersWithConstrains.find(name) != layersWithConstrains.end(); } + size_t getOutputBatchSize() const { + if (!layer) { + THROW_GNA_EXCEPTION << "layer is null"; + } + if (!layer->outData[0]) { + THROW_GNA_EXCEPTION << "output data of layer '" << layer->name << "' is null"; + } + auto& dims = layer->outData[0]->getDims(); + auto layout = layer->outData[0]->getLayout(); + switch (dims.size()) { + case 1: + return 1; + case 2: + if (layout == InferenceEngine::Layout::NC) { + return dims[0]; + } else if (layout == InferenceEngine::Layout::CN) { + return dims[1]; + } else { + THROW_GNA_EXCEPTION << "batch size is not define in layer '" << layer->name << "'"; + } + case 4: + return dims[0]; + default: + THROW_GNA_EXCEPTION << "batch size is not define in layer '" << layer->name << "'"; + } + } bool isActivation() const noexcept { IS_VALID(); static InferenceEngine::details::caseless_set activations = diff --git a/inference-engine/src/gna_plugin/layers/gna_layer_type.cpp b/inference-engine/src/gna_plugin/layers/gna_layer_type.cpp index a333d47c48aac3..7a3de49a2de052 100644 --- a/inference-engine/src/gna_plugin/layers/gna_layer_type.cpp +++ b/inference-engine/src/gna_plugin/layers/gna_layer_type.cpp @@ -15,52 +15,3 @@ GNAPluginNS::LayerType GNAPluginNS::LayerTypeFromStr(const std::string &str) { else return NO_TYPE; } - -bool GNAPluginNS::AreLayersSupported(InferenceEngine::CNNNetwork& network, std::string& errMessage) { - IE_SUPPRESS_DEPRECATED_START - InferenceEngine::InputsDataMap inputs = network.getInputsInfo(); - std::unordered_set allLayers; - InferenceEngine::CNNLayerPtr startLayer; - if (inputs.empty()) { - auto outputs = network.getOutputsInfo(); - IE_ASSERT(!outputs.empty()); - // If there are no inputs start search from an output - startLayer = getCreatorLayer(outputs.begin()->second).lock(); - } else { - auto 
network_input_precision = inputs.begin()->second->getPrecision(); - - if (network_input_precision != InferenceEngine::Precision::FP32 && - network_input_precision != InferenceEngine::Precision::I16 && - network_input_precision != InferenceEngine::Precision::U8) { - errMessage = "The plugin does not support input precision with " + - std::string(network_input_precision.name()) + - " format. Supported input precisions FP32, I16, U8\n"; - return false; - } - - auto & secondLayers = getInputTo(inputs.begin()->second->getInputData()); - if (secondLayers.empty()) { - errMessage = "Network consists of input layer only (GNA)\n"; - return false; - } - startLayer = secondLayers.begin()->second; - } - auto batch_size = network.getBatchSize(); - - bool check_result = true; - InferenceEngine::details::UnorderedDFS(allLayers, - startLayer, - [&](const InferenceEngine::CNNLayerPtr layer) { - if (LayerTypeFromStr(layer->type) == LayerType::NO_TYPE) { - errMessage = "The plugin does not support layer: " + layer->name + ":" + layer->type + "\n"; - check_result = false; - } - if (batch_size != 1 && LayerInfo::isBatchSizeConstrained(layer->type)) { - errMessage = "topology with layer: " + layer->name + ", type: " + layer->type + - ", and batch size(" + std::to_string(batch_size) + ") != 1 not supported"; - check_result = false; - } - }, false); - IE_SUPPRESS_DEPRECATED_END - return check_result; -} diff --git a/inference-engine/src/gna_plugin/layers/gna_layer_type.hpp b/inference-engine/src/gna_plugin/layers/gna_layer_type.hpp index 7fdbf7e576d24b..266590dba68e43 100644 --- a/inference-engine/src/gna_plugin/layers/gna_layer_type.hpp +++ b/inference-engine/src/gna_plugin/layers/gna_layer_type.hpp @@ -91,5 +91,4 @@ static const InferenceEngine::details::caseless_map #include #include @@ -41,6 +40,7 @@ #include "gna_graph_patterns.hpp" #include "gna_data_types.hpp" #include "gna_tensor_tools.hpp" +#include "gna_itt.hpp" using namespace InferenceEngine; using namespace InferenceEngine::details; @@ -85,9 +85,8 @@ static void insertDiagonalLayerBetween(InferenceEngine::CNNLayerPtr prevLayer, return LayerInfo(ptr).isNonValuesChangable(); }); IE_ASSERT(inputLayer != nullptr); - size_t weightsSize = (LayerInfo(prevLayer).has32BOutput() || LayerInfo(inputLayer).isInput()) ? - weightsSize = nextLayer->outData[0]->getDims().back() : - Get2DReshapedData(nextLayer->outData[0], 8)->getDims()[1]; + size_t weightsSize = LayerInfo(prevLayer).has32BOutput() ? 
nextLayer->outData[0]->getDims().back() : + Get2DReshapedData(nextLayer->outData[0], 8)->getDims()[1]; std::vector weightsValues(weightsSize, fillValue); IE_ASSERT(diagLayer != nullptr); diagLayer->_weights = make_shared_blob( @@ -113,6 +112,7 @@ static void insertDiagonalLayerBetween(InferenceEngine::CNNLayerPtr prevLayer, */ static CNNLayerPtr InsertCopyLayer(CNNLayerPtr prevLayer, CNNLayerPtr nextLayer, int beforeIdx, std::shared_ptr passmanager, std::string copyLayerType) { + OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "InsertCopyLayer"); auto quantized = InferenceEngine::getInjectedData(prevLayer); std::string copyName = copyLayerType + std::string("_") + std::to_string(passmanager->getIntVar(copyLayersCounter)++); gnalog() << "Inserted " << copyName << " between: " << prevLayer->name << " and " << nextLayer->name << std::endl; @@ -258,6 +258,7 @@ static std::vector getCandidatesForIdentityInsertion(const CNNLayer } void InsertDiagonalLayerPass::run() { + OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "InsertDiagonalLayerPass"); bool lowPrecision = getPassManager()->isLowPrecision(); for (auto & l : *pLayers) { @@ -305,6 +306,7 @@ void InsertDiagonalLayerPass::run() { } void HandleMultipleActivationsForTheLayerPass::run() { + OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "HandleMultipleActivationsForTheLayerPass"); // found layer followed by multiple activations for (auto & l : *pLayers) { CNNLayerSet activations; @@ -314,6 +316,7 @@ void HandleMultipleActivationsForTheLayerPass::run() { LayerInfo info(inputTo.second); if (info.isActivation()) { + if (odata->getDims().empty()) continue; if (!activations.empty() && odata->getDims()[0] != 1) { THROW_GNA_EXCEPTION << "Unsupported batch size " << odata->getDims()[0] << " for diagonal layer insertion"; @@ -333,6 +336,7 @@ void HandleMultipleActivationsForTheLayerPass::run() { } void ForbidActivationFusingPass::run() { + OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "ForbidActivationFusingPass"); for (auto& l : *pLayers) { if (LayerInfo(l).isActivation()) { auto prevLayer = CNNNetPrevLayer(l); @@ -370,6 +374,7 @@ namespace { } // namespace void ReorderMaxPoolPass::run() { + OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "ReorderMaxPoolPass"); // detecting following pattern // conv->activation->maxpooling // changing it to conv->maxpooling->activation @@ -398,6 +403,7 @@ void ReorderMaxPoolPass::run() { } void SubstituteSoftSignPass::run() { + OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "SubstituteSoftSignPass"); //detecting following pattern // irv7 model: irv10 model: // a layer a layer @@ -501,6 +507,7 @@ void SubstituteSoftSignPass::run() { } } void SubstitutePReluPass::run() { + OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "SubstitutePReluPass"); auto getScale = [](CNNLayer* layer) { auto powerCandidate = LayerInfo(layer); if (!powerCandidate.isPower()) return 0.0f; @@ -605,99 +612,8 @@ void SubstitutePReluPass::run() { } } -void ReversePermutationsPass::run() { - std::function)> prevLayerSkipCertain - = [&prevLayerSkipCertain](CNNLayerPtr layer, std::function shouldSkip) -> CNNLayerPtr { - if (CNNNetHasPrevLayer(layer.get())) { - return nullptr; - } - auto prev = CNNNetPrevLayer(layer); - - if (!shouldSkip(prev)) return prevLayerSkipCertain(prev, shouldSkip); - - return prev; - }; - - std::function nextLayerSkipReshape = [&nextLayerSkipReshape](CNNLayerPtr layer) -> CNNLayerPtr { - if (layer->outData.empty()) { - return nullptr; - } - if (getInputTo(layer->outData.front()).size() != 1) { - return nullptr; - } - auto next = 
getInputTo(layer->outData.front()).begin()->second; - - if (LayerInfo(next).isNonFunctional()) return nextLayerSkipReshape(next); - - return next; - }; - - auto prevConv = [&prevLayerSkipCertain](CNNLayerPtr layer) -> CNNLayerPtr { - return prevLayerSkipCertain(layer, [] (CNNLayerPtr l2) { - return - LayerInfo(l2).isNonFunctional() || - LayerInfo(l2).isPooling() || - LayerInfo(l2).isActivation(); - }); - }; - - std::unordered_set affineWithPermutedWeights; - std::list permutationstoRemove; - - for (auto & l : *pLayers) { - if (!LayerInfo(l).isPermute()) { - continue; - } - - auto layerOrder = l->GetParamAsInts("order"); - - if (layerOrder != std::vector({0, 3, 2, 1})) { - THROW_GNA_EXCEPTION << "Unsupported permute layer: " << l->name << ", order: was " << l->GetParamAsString("order") << - ", but support order is 0,3,2,1"; - } - - // search for it's input convolution - auto prev = prevConv(l); - - // pooling no used in speech models without convolution - if (!prev) { - THROW_GNA_EXCEPTION << "Unsupported permute layer: " << l->name << " no valid input to that layer"; - } - - // we can remove that permutation if it is input to ScaleShift or FC layer - auto next = nextLayerSkipReshape(l); - if (!next || !LayerInfo(next).isFullyConnected()) { - THROW_GNA_EXCEPTION << "Unsupported permute layer: " << l->name << " no valid output of that layer"; - } - - permutationstoRemove.push_back(l); - - // removing that permutation layer and saving information about affine - affineWithPermutedWeights.insert(next->name); - } - - for (auto && toRemove : permutationstoRemove) { - CNNNetworkRemoveLayer(toRemove); - } - - // search for conv->affine sequences - for (auto & l : *pLayers) { - if (!LayerInfo(l).isFullyConnected() || 0 != affineWithPermutedWeights.count(l->name)) { - continue; - } - // found an affine layer that not involved in permutations removing - // searching whether it has direct input from convolution - auto prevConvLayer = prevConv(l); - if (!prevConvLayer) continue; - - auto directPrev = CNNNetPrevLayer(l); - - // TODO : make new permute - CNNNetworkInsertLayer(l, directPrev, CNNLayerPtr(nullptr)); - } -} - void RemovePermutationsNHWCToNCHWPass::run() { + OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "RemovePermutationsNHWCToNCHWPass"); std::set permutations_to_remove; std::list> nhwc_layout_patterns; for (auto& l : *pLayers) { @@ -710,7 +626,7 @@ void RemovePermutationsNHWCToNCHWPass::run() { if (prev == nullptr || next == nullptr) continue; - if (LayerInfo(prev).isPermute() && getPassManager()->getPolicy().NHWCToNCHWPolicy == Policy::NHWCToNCHW::REMOVE_ALL) { + if (LayerInfo(prev).isPermute()) { permutations_to_remove.insert(prev); } @@ -741,12 +657,17 @@ void RemovePermutationsNHWCToNCHWPass::run() { IE_ASSERT(!input_to.empty()); auto current_layer = input_to.begin()->second; setNHWCOrder(current_layer->input()); - while (current_layer != pattern_end) { - setNHWCOrder(current_layer->outData[0]); - input_to = getInputTo(current_layer->outData[0]); - IE_ASSERT(!input_to.empty()); - current_layer = input_to.begin()->second; - } + std::function propogateNHWCOrderRecursive = + [pattern_end, &propogateNHWCOrderRecursive, &setNHWCOrder](CNNLayerPtr current_layer) { + if (current_layer == pattern_end) return; + for (size_t i = 0; i < current_layer->outData.size(); ++i) { + setNHWCOrder(current_layer->outData[i]); + auto input_to = getInputTo(current_layer->outData[i]); + IE_ASSERT(!input_to.empty()); + propogateNHWCOrderRecursive(input_to.begin()->second); + } + }; + 
propogateNHWCOrderRecursive(current_layer); if (LayerInfo(pattern_start).isPermute() && !getInputTo(pattern_start->outData.front()).empty()) { auto layer_before_permute = CNNNetPrevLayer(pattern_start); @@ -776,6 +697,7 @@ void RemovePermutationsNHWCToNCHWPass::run() { } void InsertIdentityLayerPass::run() { + OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "InsertIdentityLayerPass"); auto quantized = InferenceEngine::getInjectedData(pLayers->front()); auto createIdentityLayer = [quantized, this](const TensorDesc& tensorDesc) { int numOfIdentityLayers = this->getPassManager()->getIntVar(identityLayersCounterName)++; @@ -893,6 +815,7 @@ void InsertIdentityLayerPass::run() { } void InsertCopyLayerPass::run() { + OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "InsertCopyLayerPass"); // Copy layer insertion happens in few cases: // Crop output goes to concat layer -> copy layer insertion // Splitted part of input goes to concat layer -> copy layer insertion @@ -1015,6 +938,7 @@ void InsertCopyLayerPass::run() { } void FlattenTrivialConcatPass::run() { + OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "FlattenTrivialConcatPass"); // change all trivial concatenations (concatenation where output buffer is a buffer made by appending input buffers) // by reshaping its inputs to 1 x total_input_size and its output to 1 x total_cocat_size and chaning the axis to 1 // for example if 4D concat have unaligned inputs then ConcatAlignFilters need to be used if sizes before @@ -1022,9 +946,6 @@ void FlattenTrivialConcatPass::run() { // 1, 1, 5, 3 then for axis 0, 1, 2 the change will be made and inputs will be reshaped to 1, 15, // but for shape 2, 1, 5, 3 only axis 0 is valid and inputs will reshape to 1, 30 auto quantized = InferenceEngine::getInjectedData(pLayers->front()); - if (getPassManager()->getPolicy().ConcatConversionPolicy == Policy::FlattenTrivialConcatConversion::DISABLED) return; - if (getPassManager()->getPolicy().ConcatAlignmentPolicy == Policy::ConcatAlignment::DISABLED) return; - if (getPassManager()->getPolicy().ConcatAlignmentPolicy == Policy::ConcatAlignment::DISABLED_FOR_FP32 && !quantized) return; auto getLayerByIndex = [](int idx, ConcatLayer* concatLayer) { auto input = concatLayer->insData[idx]; @@ -1098,15 +1019,8 @@ void FlattenTrivialConcatPass::run() { } void InsertConcatAligningFilterPass::run() { + OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "InsertConcatAligningFilterPass"); auto quantized = InferenceEngine::getInjectedData(pLayers->front()); - - if (getPassManager()->getPolicy().ConcatAlignmentPolicy == Policy::ConcatAlignment::DISABLED) { - return; - } - // aligning specific not required in fp32 mode - if (getPassManager()->getPolicy().ConcatAlignmentPolicy == Policy::ConcatAlignment::DISABLED_FOR_FP32 && !quantized) { - return; - } // currently concat layer only supports 2 bytes in int16 and int8 mode. In fp32 mode this no necessary but usefull for testing const int bytesPerConcatElement = 2; @@ -1197,6 +1111,13 @@ void InsertConcatAligningFilterPass::run() { // modifying output rows to be used - to avoid modification to original concat we are store num of elements in params dims[1] = num_rows_out; + if ((concatInput->getLayout() == Layout::NC && dims[0] > 8) || + (concatInput->getLayout() == Layout::CN && dims[1] > 8)) { + THROW_GNA_EXCEPTION << "unsupported batch number '" << + (concatInput->getLayout() == Layout::NC ? 
dims[0] : dims[1]) << + "' in layer '" << concatLayer->name << "'"; + } + auto outData = std::make_shared(filterName, TensorDesc(concatInput->getPrecision(), dims, @@ -1216,11 +1137,8 @@ void InsertConcatAligningFilterPass::run() { } void ReorderConcatInputsPass::run() { + OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "ReorderConcatInputsPass"); auto quantized = InferenceEngine::getInjectedData(pLayers->front()); - // aligning specific not required in fp32 mode - if (getPassManager()->getPolicy().ConcatAlignmentPolicy == Policy::ConcatAlignment::DISABLED_FOR_FP32 && !quantized) { - return; - } int numOfLinkLayers = 0; for (auto& l : *pLayers) { @@ -1313,6 +1231,7 @@ void ReorderConcatInputsPass::run() { } void InsertSplitAligningFilterPass::run() { + OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "InsertSplitAligningFilterPass"); // currently split layer only supports 2 bytes in int16 and int8 mode. In fp32 mode this is not necessary but is useful for testing const int bytesPerSplitElement = 2; auto quantized = InferenceEngine::getInjectedData(pLayers->front()); @@ -1432,9 +1351,7 @@ static InferenceEngine::Blob::Ptr tileBlob(Blob::Ptr& blob, size_t TileTo) { } void EltwiseSplitOverChannelsPass::run() { - if (getPassManager()->getPolicy().GNAAffineDiagonalPolicy.limitedTo == Policy::GNAAffineDiagonal::UNLIMIT) { - return; - } + OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "EltwiseSplitOverChannelsPass"); for (auto & l : *pLayers) { if (!LayerInfo(l).isEltwise()) { @@ -1447,21 +1364,20 @@ void EltwiseSplitOverChannelsPass::run() { THROW_GNA_LAYER_EXCEPTION(l) << "number of outputs expected to be 1"; } auto oData = l->outData.front(); + auto out_width = GetDataDimSize(oData, DataDimName::W); auto totalElementsForOutput = details::product(oData->getDims().begin(), oData->getDims().end()); - auto maxAffineElements = getPassManager()->getPolicy().GNAAffineDiagonalPolicy.limitedTo; + // gna limit this to be OxFFFF + auto maxAffineElements = 65536 - 64; if (totalElementsForOutput <= maxAffineElements) { continue; } - // TODO: for now lets put split of 2 elements as restrictions auto totalSplits = 1 + totalElementsForOutput / maxAffineElements; - if (totalSplits > 2) { - THROW_GNA_LAYER_EXCEPTION(l) << "split layer over output channels on more than 2 layers unsupported"; - } pass_trace() << "transforming " << LAYER_NAME(l) << " by splitting it to multiple eltwise operations\n"; auto quantized = InferenceEngine::getInjectedData(l); + bool sameInputs = l->insData[0].lock() == l->insData[1].lock(); std::vector splitLayers(2); for (size_t kThEltwiseInput = 0; kThEltwiseInput != 2; kThEltwiseInput++) { // create split layer @@ -1472,31 +1388,38 @@ void EltwiseSplitOverChannelsPass::run() { split->insData.push_back(l->insData[kThEltwiseInput]); auto inputDesc = l->insData[kThEltwiseInput].lock()->getTensorDesc(); - // need to split this desc - if (inputDesc.getLayout() != Layout::NC) { - THROW_GNA_LAYER_EXCEPTION(l) - << "cannot split over channel: input " << std::to_string(kThEltwiseInput) - << " layout need to be NC"; - } // create split layer outputs - for (size_t i = 0;; i++) { - auto elements_num = std::min(totalElementsForOutput - i * maxAffineElements, + size_t usedElements = 0; + for (size_t i = 0; i < totalSplits; i++) { + SizeVector newDims; + size_t elements_num = std::min(totalElementsForOutput - usedElements, static_cast(maxAffineElements)); + if (inputDesc.getDims().size() == 2) { + newDims = SizeVector{1, elements_num}; + } else { + elements_num = elements_num - elements_num % out_width; + newDims = 
SizeVector{1, elements_num / out_width, out_width}; + } - SizeVector newDims = {1, elements_num}; auto newDesc = TensorDesc(inputDesc.getPrecision(), newDims, inputDesc.getLayout()); auto data = std::make_shared(l->name + "/" + std::to_string(kThEltwiseInput) + "/1", newDesc); getCreatorLayer(data) = split; split->outData.push_back(data); - if (elements_num != maxAffineElements) { + usedElements += elements_num; + if (usedElements == totalElementsForOutput) { break; } } // replacing connection X->eltwise to X->split auto oData = CNNLayerFindOutData(l, kThEltwiseInput); oData.second->second = split; + + if (sameInputs) { + splitLayers[1] = splitLayers[0]; + break; + } } // create concatlayer @@ -1507,8 +1430,6 @@ void EltwiseSplitOverChannelsPass::run() { concat->outData.push_back(masterEltwise->outData.front()); getCreatorLayer(masterEltwise->outData.front()) = concat; - - // create new eltwise layers - here 2 hardcode for (size_t k = 0; k != totalSplits; k++) { auto eltwiseRaw = std::make_shared( LayerParams{l->name + "/eltwise/" + std::to_string(k), "Eltwise", Precision::FP32}); @@ -1517,7 +1438,6 @@ void EltwiseSplitOverChannelsPass::run() { eltwiseRaw->coeff = masterEltwise->coeff; auto eltwise = quantized ? InferenceEngine::injectData(eltwiseRaw) : eltwiseRaw; - eltwise->insData.push_back(splitLayers[0]->outData[k]); eltwise->insData.push_back(splitLayers[1]->outData[k]); getInputTo(splitLayers[0]->outData[k])[eltwise->name] = eltwise; @@ -1529,6 +1449,15 @@ void EltwiseSplitOverChannelsPass::run() { auto data = std::make_shared(l->name + "/elwise/out/" + std::to_string(k), newDesc); getCreatorLayer(data) = eltwise; eltwise->outData.push_back(data); + if (quantized) { + auto eltwiseQuant = InferenceEngine::getInjectedData(eltwise); + if (quantized->_src_quant.IsStatsSet()) { + eltwiseQuant->_src_quant.CopyStats(quantized->_src_quant); + } + if (quantized->_dst_quant.IsStatsSet()) { + eltwiseQuant->_dst_quant.CopyStats(quantized->_dst_quant); + } + } getInputTo(data)[concat->name] = concat; concat->insData.push_back(data); } @@ -1536,6 +1465,7 @@ void EltwiseSplitOverChannelsPass::run() { } void SubstituteScaleShiftBroadCastPass::run() { + OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "SubstituteScaleShiftBroadCastPass"); std::map reshaped_data; auto quantized = InferenceEngine::getInjectedData(pLayers->front()); @@ -1588,35 +1518,30 @@ void SubstituteScaleShiftBroadCastPass::run() { } gnalog() << "Substitution ScaleShift broadcast for layer: " << l->name << "\n"; - // approach 1 - weights tiling - if (getPassManager()->getPolicy().ScaleShiftPolicy == Policy::ScaleShift::WEIGHTS_TILING) { - if (nElements % scaleShift->_weights->size()) { - THROW_GNA_EXCEPTION << "Cannot tile weights for layer: " << l->name << ", due to weights size not GCD of dims product"; - } - scaleShift->_weights = tileBlob(scaleShift->_weights, nElements); - if (scaleShift->_biases) { - if (nElements % scaleShift->_biases->size()) { - THROW_GNA_EXCEPTION << "Cannot tile biases for layer: " << l->name << ", due to biases size not GCD of dims product"; - } - scaleShift->_biases = tileBlob(scaleShift->_biases, nElements); + if (nElements % scaleShift->_weights->size()) { + THROW_GNA_EXCEPTION << "Cannot tile weights for layer: " << l->name << ", due to weights size not GCD of dims product"; + } + scaleShift->_weights = tileBlob(scaleShift->_weights, nElements); + if (scaleShift->_biases) { + if (nElements % scaleShift->_biases->size()) { + THROW_GNA_EXCEPTION << "Cannot tile biases for layer: " << l->name << ", due to 
biases size not GCD of dims product"; } + scaleShift->_biases = tileBlob(scaleShift->_biases, nElements); + } - auto tensor = InferenceEngine::TensorDesc(insData->getTensorDesc()); - tensor.reshape(SizeVector{ batchSize, nElements }, Layout::NC); - auto reshapeName = scaleShift->name + "_input_" + std::to_string(0) + "_reshape"; - auto reshape = CNNNetworkCreateReshape(tensor, reshapeName, quantized); - auto layer_before_scale_shift = getCreatorLayer(insData); + auto tensor = InferenceEngine::TensorDesc(insData->getTensorDesc()); + tensor.reshape(SizeVector{ batchSize, nElements }, Layout::NC); + auto reshapeName = scaleShift->name + "_input_" + std::to_string(0) + "_reshape"; + auto reshape = CNNNetworkCreateReshape(tensor, reshapeName, quantized); + auto layer_before_scale_shift = getCreatorLayer(insData); - CNNNetworkInsertLayer(layer_before_scale_shift.lock(), l, reshape); - gnalog() << "\tInserted " << reshapeName << " between " << layer_before_scale_shift.lock()->name << " and " << l->name << std::endl; - } else { - THROW_GNA_EXCEPTION << "Not implemented substitution of scaleshift broadcast policy of " - << getPassManager()->getPolicy().ScaleShiftPolicy << "using layers tiling, layer: " << l->name; - } + CNNNetworkInsertLayer(layer_before_scale_shift.lock(), l, reshape); + gnalog() << "\tInserted " << reshapeName << " between " << layer_before_scale_shift.lock()->name << " and " << l->name << std::endl; } } void BroadcastConstPass::run() { + OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "BroadcastConstPass"); for (auto constLayer : *pLayers) { if (!LayerInfo(constLayer).isConst()) { continue; @@ -1627,7 +1552,7 @@ void BroadcastConstPass::run() { }; auto nextLayer = CNNNetCheckNextLayerSkipCertain(constLayer, 0, 0, true, isNonFunctional).first; - if (!nextLayer || !LayerInfo(nextLayer).isEltwise() && !LayerInfo(nextLayer).isFakeQuantize()) { + if (!nextLayer || (!LayerInfo(nextLayer).isEltwise() && !LayerInfo(nextLayer).isFakeQuantize())) { continue; } @@ -1669,6 +1594,7 @@ void BroadcastConstPass::run() { } void InsertIdentityToLSTMCellPass::run() { + OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "InsertIdentityToLSTMCellPass"); for (auto layer : *pLayers) { if (layer->type == "LSTMCell") { // This fixed the cases when both functional and non-functional outputs are mixed (or not outputs are used) @@ -1706,6 +1632,7 @@ void InsertIdentityToLSTMCellPass::run() { } void BreakFusingOfOutputLayersPass::run() { + OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "BreakFusingOfOutputLayersPass"); #if GNA_LIB_VER == 1 return; #endif @@ -1749,6 +1676,7 @@ void BreakFusingOfOutputLayersPass::run() { } void UnrollLSTMCellPass::run() { + OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "UnrollLSTMCellPass"); InferenceEngine::NetPass::UnrollRNN_if(getPassManager()->getNetwork(), [] (const RNNCellBase& rnn) -> bool { if (rnn.clip != 0.0f) return true; @@ -1765,6 +1693,7 @@ void UnrollLSTMCellPass::run() { } void UnrollTIPass::run() { + OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "UnrollTIPass"); auto sts = InferenceEngine::NetPass::UnrollTI(getPassManager()->getNetwork()); if (!sts) { THROW_GNA_EXCEPTION << "TensorIterator layer cannot be unrolled!"; @@ -1772,6 +1701,7 @@ void UnrollTIPass::run() { } void RemoveConstPass::run() { + OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "RemoveConstPass"); auto network = getPassManager()->getNetwork(); IE_SUPPRESS_DEPRECATED_START auto & icnnnet = static_cast(network); @@ -1785,6 +1715,7 @@ void RemoveConstPass::run() { } void RemoveSingleInputConcatPass::run() { + 
OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "RemoveSingleInputConcatPass"); for (auto &l : *pLayers) { if (l->type == "Concat") { auto concat = dynamic_cast(l.get()); @@ -1812,6 +1743,7 @@ void RemoveSingleInputConcatPass::run() { } void FuseMultipleIdentitiesPass::run() { + OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "FuseMultipleIdentitiesPass"); for (auto &l : *pLayers) { if (l->insData.empty()) continue; @@ -1893,6 +1825,7 @@ void FuseMultipleIdentitiesPass::run() { } void FuseFQIntoWeightsPass::run() { + OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "FuseFQIntoWeightsPass"); auto isNonFunctional = [](CNNLayerPtr ptr) { return LayerInfo(ptr).isNonFunctional(); }; @@ -1919,13 +1852,20 @@ void FuseFQIntoWeightsPass::run() { } GNAFakeQuantizeLayer gnaFakeQuantizeLayer(fqLayer); - size_t layers_connected_to_fq_count = getInputTo(fqLayer->outData[0]).size(); + auto inputTo = getInputTo(fqLayer->outData[0]); + size_t layers_connected_to_fq_count = inputTo.size(); + auto layerBeforeWeightable = fqLayer; + while (layers_connected_to_fq_count == 1 && LayerInfo(inputTo.begin()->second).isNonFunctional()) { + layerBeforeWeightable = inputTo.begin()->second; + inputTo = getInputTo(layerBeforeWeightable->outData[0]); + layers_connected_to_fq_count = inputTo.size(); + } for (int index = 0; index < layers_connected_to_fq_count; index++) { - auto weightableLayer = CNNNetGetNextLayerSkipCertain(fqLayer, 0, index, isNonFunctional).first; + auto weightableLayer = CNNNetGetNextLayerSkipCertain(layerBeforeWeightable, 0, index, isNonFunctional).first; if (!LayerInfo(weightableLayer).isWeightable()) { continue; } - if (weightableLayer->insData.size() != 3) { + if (weightableLayer->insData.size() < 2) { continue; } @@ -1942,7 +1882,8 @@ void FuseFQIntoWeightsPass::run() { pass_trace() << "found " << LAYER_NAME(fqLayer) << " that will be converted to weights of " << LAYER_NAME(weightableLayer) << "\n"; - auto biases = LayerUtils::getParamFromInputAsBlob(weightableLayer, biasesIdx); + auto biases = weightableLayer->insData.size() == 3 ? + LayerUtils::getParamFromInputAsBlob(weightableLayer, biasesIdx) : nullptr; auto quantizedWeights = gnaFakeQuantizeLayer.getConstInputData(); // 1. 
broke existing connections - by detaching fq subgraph from rest of graph @@ -2043,6 +1984,7 @@ void FuseFQIntoWeightsPass::run() { } void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() { + OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "MoveFakeQuantizeLayerIntoQuantParamsPass"); auto quantized = InferenceEngine::getInjectedData(pLayers->front()); if (!quantized) { return; @@ -2149,8 +2091,11 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() { } GNAFakeQuantizeLayer fqLayer(l); auto prevLayer = CNNNetPrevLayerSkipCertain(*fqLayer, 0, donotSkip); - if (prevLayer->outData.size() != 1) { - THROW_GNA_LAYER_EXCEPTION(prevLayer) << " fake quantize input that connected to something else not supported"; + auto prevDataIt = std::find_if(std::begin(prevLayer->outData), std::end(prevLayer->outData), [l](DataPtr data) { + return getInputTo(data).find(l->name) != std::end(getInputTo(data)); + }); + if (prevDataIt == std::end(prevLayer->outData)) { + THROW_GNA_LAYER_EXCEPTION(fqLayer) << "Invalid connection between " << prevLayer->name << " and " << l->name; } auto inputRange = fqLayer.getInputRange(); @@ -2181,8 +2126,18 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() { quantParamsPrevLayer->_dst_quant.SetMinValues({ outputRange.first[0] }, false); quantParamsPrevLayer->_dst_quant.SetMaxValues({ outputRange.second[0] }, false); + // Propogate destination statistics to multiply layer if it's set for the next sum/sub layer (is considered as bias) + if (LayerInfo(prevLayer).isEltwiseSum() || LayerInfo(prevLayer).isEltwiseSub()) { + auto eltwPrevLayer = CNNNetPrevLayerSkipCertain(prevLayer, 0, donotSkip); + auto constLayer = CNNNetPrevLayerSkipCertain(prevLayer, 1, donotSkip); + if (LayerInfo(eltwPrevLayer).isEltwise() && LayerInfo(constLayer).isConst()) { + auto quantParamsEltwLayer = InferenceEngine::getInjectedData(eltwPrevLayer); + quantParamsEltwLayer->_dst_quant.CopyStats(quantParamsPrevLayer->_dst_quant); + } + } + auto fqQauntParams = InferenceEngine::getInjectedData(l); - fqQauntParams->_dst_quant.SetLevels(fqLevels); + fqQauntParams->_dst_quant.SetLevels(UINT16_MAX); fqQauntParams->_dst_quant.SetMinValues({ inputRange.first[0] }, true); fqQauntParams->_dst_quant.SetMaxValues({ inputRange.second[0] }, true); fqQauntParams->_dst_quant.SetMinValues({ outputRange.first[0] }, false); @@ -2198,7 +2153,7 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() { // FQ Layer is fused only when previous layer is const, memory or activation layer // or a next layer is activation layer. 
bool isFQFuseAllowed = allowFQFuse(l); - auto prevData = prevLayer->outData.front(); + auto prevData = *prevDataIt; // Find all output layers connected to FQ auto nextLayers = CNNNetGetAllNextLayersSkipCertain(*fqLayer, -1, donotSkip); @@ -2207,7 +2162,7 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() { } if (isFQFuseAllowed) { - getInputTo(prevLayer->outData.front()).clear(); + getInputTo(prevData).clear(); } // Connect all next layers after FQ to the layer that is before FQ @@ -2222,7 +2177,7 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() { for (int insDataIdx : insDatas) { nextLayers[i]->insData[insDataIdx] = prevData; } - getInputTo(prevLayer->outData.front())[nextLayers[i]->name] = nextLayers[i]; + getInputTo(prevData)[nextLayers[i]->name] = nextLayers[i]; } propagateStatistics(quantParamsPrevLayer, nextLayers[i]); @@ -2231,6 +2186,7 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() { } void TransposeWeightsFromNCHWToNHWCPass::run() { + OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "TransposeWeightsFromNCHWToNHWCPass"); if (!MustBeConvertedFromNCHWToNHWC(*pLayers)) return; auto printTranspositionInfo = [](const std::vector &transpositionInfo) { diff --git a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.hpp b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.hpp index af98288396e8c4..97e1e942e5c844 100644 --- a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.hpp +++ b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.hpp @@ -8,7 +8,6 @@ #include #include #include -#include "gna_plugin_policy.hpp" namespace GNAPluginNS { /** @@ -29,7 +28,6 @@ class IPassManager { public: virtual ~IPassManager() = default; virtual int &getIntVar(std::string name) = 0; - virtual const Policy &getPolicy() const = 0; virtual const bool& isLowPrecision() const = 0; virtual InferenceEngine::CNNNetwork &getNetwork() = 0; }; @@ -75,17 +73,6 @@ DECL_PASS(InsertIdentityLayer); */ DECL_PASS(SubstituteScaleShiftBroadCast); -/** - * @brief GNA convolution layers have deinterleaved layout, while affine one doesn't - * so between convolution and affine layers permute layers need to be inserted, - * current MO approach is to insert such permutations - * since GNA-HW already support conv->affine in permuted for, this pass inverses MO behavior - * so its remove permutations of certain form conv->conv, and between conv->affine - * and insert permutation between conv->affine if they are missed in IR - * @param layers - */ -DECL_PASS(ReversePermutations); - /** * @brief Pass support --disable_nhwc_to_nchw option in MO * @param layers @@ -220,7 +207,6 @@ DECL_PASS(MoveFakeQuantizeLayerIntoQuantParams); DECL_PASS(TransposeWeightsFromNCHWToNHWC); struct PassManagerSettings { - Policy policy; /// @brief whether to run passes before copy bool runBeforeCopy; bool lowPrecision; @@ -245,9 +231,6 @@ class PassManager : public IPassManager, public std::enable_shared_from_this +#include +#include "backend/dnn_types.h" +#include "serial/headers/2dot4/gna_model_header.hpp" +#include "serial/headers/2dot6/gna_model_header.hpp" +#include "serial/headers/latest/gna_model_header.hpp" +#include "gna_data_types.hpp" + +#pragma pack(push, 1) + +namespace GNAPluginNS { +namespace Header2dot7 { + +/** + Maximal number of supported shape dimensions. 
+ */ +#define GNA_SHAPE_MAXIMUM_NUMBER_OF_DIMENSIONS 8 + +/** + * @brief Header version 2.7 + */ +struct ModelHeader { + /** + *@brief MagicNumber – GNAM in ascii table, equals to hex 0x474e414d + */ + char gnam[4] = {}; + /** + * @brief if header size is not equal to sizeof ModelHeader - some reserved data append in the end of header + * usually it is an indicator of working with version of model different that is current export function produce + */ + uint32_t headerSize = 0u; + struct Version { + /** + * @details Version of format Major – unsigned int, ex: 0x0001 + * every change in the header or in the layers definition should be reflected in version change + * for backward compatibility new parsers can read old versions of model with certain restrictions + */ + uint16_t major = 2u; + /** + * @details Version of Format Minor – unsigned int, corresponding to build revision for example + * changes in minor version are not affected layout of model + */ + uint32_t minor = 7u; + } version; + /** + * @brief Memory required to be allocated using GNAAlloc() + */ + uint64_t gnaMemSize = 0ull; + /** + * @brief Number of GNA Layers + */ + uint64_t layersCount = 0ull; + /** + * @brief Grouping level + * This is depricted field and used for old models only (<=2.6) + */ + uint32_t nGroup = 0u; + + /** + * Convolution related setting - they are affecting input transformation + */ + uint32_t nRotateRows = 0u; + uint32_t nRotateColumns = 0u; + bool doRotateInput = false; + + uint32_t nInputs = 0u; + uint32_t nOutputs = 0u; + + /** + * Convolution related setting - they are affecting output transformation + */ + uint32_t nRotateOutputRows = 0u; + uint32_t nRotateOutputColumns = 0u; + bool doRotateOutput = false; + + uint32_t nTransposeInputs = 0u; + uint32_t nTransposeOutputs = 0u; + + /** + * Reserved Data might be here + */ + ModelHeader() = default; + ModelHeader(GNAPluginNS::Header2dot1::ModelHeader const &old) { + gnaMemSize = old.gnaMemSize; + layersCount = old.layersCount; + nGroup = old.nGroup; + nRotateRows = old.nRotateRows; + nRotateColumns = old.nRotateColumns; + nInputs = old.nInputs; + nOutputs = old.nOutputs; + version.minor = old.version.minor; + } + ModelHeader(GNAPluginNS::Header2dot4::ModelHeader const &old) { + gnaMemSize = old.gnaMemSize; + layersCount = old.layersCount; + nGroup = old.nGroup; + nRotateRows = old.nRotateRows; + nRotateColumns = old.nRotateColumns; + nInputs = old.nInputs; + nOutputs = old.nOutputs; + nRotateOutputRows = old.nRotateOutputRows; + nRotateOutputColumns = old.nRotateOutputColumns; + doRotateOutput = old.doRotateOutput; + version.minor = old.version.minor; + } +}; +#pragma pack(pop) + +/* + * In runtime endpoint mostly same as in serial version, except of descriptor field + */ +struct RuntimeEndPoint { + /** + * if scale factor is different then pased into infer , network might need to be requantized + */ + float scaleFactor = 0; + /** + * Pointer descriptor + */ + void* descriptor_ptr = nullptr; + /** + * Endpoint resolution in bytes. + */ + uint32_t element_size = 0; + /** + * Number of elements + */ + uint32_t elements_count = 0; + /** + * Offset in bytes of pointer descriptor + */ + uint64_t descriptor_offset = 0ull; + /** + Shape specifying dimension values. + */ + struct Shape { + /** + Number of dimensions or rank or order. + */ + uint32_t NumberOfDimensions = 0; + /** + array specifying value of each dimension. + Set all zeros for scalars. 
+ */ + uint32_t Dimensions[GNA_SHAPE_MAXIMUM_NUMBER_OF_DIMENSIONS] = {0}; + } shape; + /** + * Blob layout + */ + uint8_t layout = InferenceEngine::Layout::NC; + /** + * Blob precision + */ + uint8_t precision = InferenceEngine::Precision::FP32; + + intel_dnn_orientation_t orientation = kDnnUnknownOrientation; + + RuntimeEndPoint() = default; + RuntimeEndPoint(const GNAPluginNS::Header2dot6::RuntimeEndPoint &old, uint32_t ngroup) { + scaleFactor = old.scaleFactor; + descriptor_ptr = old.descriptor_ptr; + element_size = old.element_size; + elements_count = old.elements_count; + orientation = old.orientation; + layout = InferenceEngine::Layout::NC; + precision = InferenceEngine::Precision::FP32; + descriptor_offset = old.descriptor_offset; + InferenceEngine::SizeVector dims = {ngroup, elements_count / ngroup}; + shape.NumberOfDimensions = static_cast(dims.size()); + for (auto i = 0; i < dims.size(); i++) { + shape.Dimensions[i] = dims[i]; + } + } + RuntimeEndPoint(double scaleFactor, + void* descriptor_ptr, + uint32_t element_size, + uint32_t elements_count, + Shape shape, + uint8_t layout, + uint8_t precision, + intel_dnn_orientation_t orientation) : scaleFactor(scaleFactor), + descriptor_ptr(descriptor_ptr), + element_size(element_size), + elements_count(elements_count), + shape(shape), + layout(layout), + precision(precision), + orientation(orientation) { } +}; +} // namespace Header2dot7 +} // namespace GNAPluginNS diff --git a/inference-engine/src/gna_plugin/serial/headers/latest/gna_model_header.hpp b/inference-engine/src/gna_plugin/serial/headers/latest/gna_model_header.hpp index 89292ab88afb17..7ec27b2caed386 100644 --- a/inference-engine/src/gna_plugin/serial/headers/latest/gna_model_header.hpp +++ b/inference-engine/src/gna_plugin/serial/headers/latest/gna_model_header.hpp @@ -4,11 +4,11 @@ #pragma once -#include "serial/headers/2dot6/gna_model_header.hpp" +#include "serial/headers/2dot7/gna_model_header.hpp" namespace GNAPluginNS { namespace HeaderLatest { -using ModelHeader = GNAPluginNS::Header2dot6::ModelHeader; -using RuntimeEndPoint = GNAPluginNS::Header2dot6::RuntimeEndPoint; +using ModelHeader = GNAPluginNS::Header2dot7::ModelHeader; +using RuntimeEndPoint = GNAPluginNS::Header2dot7::RuntimeEndPoint; } } diff --git a/inference-engine/src/gna_plugin/transformations/convert_matmul_to_pointwise_convolution.cpp b/inference-engine/src/gna_plugin/transformations/convert_matmul_to_pointwise_convolution.cpp new file mode 100644 index 00000000000000..e49d95ac2f2271 --- /dev/null +++ b/inference-engine/src/gna_plugin/transformations/convert_matmul_to_pointwise_convolution.cpp @@ -0,0 +1,184 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include + +#include "transformations/convert_matmul_to_pointwise_convolution.hpp" + +#include +#include +#include + +#include "layers/gna_permute.hpp" +#include "backend/gna_limitations.hpp" + +using namespace GNAPluginNS; + +NGRAPH_RTTI_DEFINITION(ConvertMatmulToPointWiseConvolution, "ConvertMatmulToPointWiseConvolution", 0); +NGRAPH_RTTI_DEFINITION(ConvertMatmulWithBiasToPointWiseConvolution, "ConvertMatmulWithBiasToPointWiseConvolution", 0); +NGRAPH_RTTI_DEFINITION(ConvertMatmulWithFqToPointWiseConvolution, "ConvertMatmulWithFqToPointWiseConvolution", 0); + +static std::tuple VerifyAndGetConvParams(std::shared_ptr matmul_node) { + auto input1_shape = matmul_node->get_input_shape(0); + auto input2_shape = matmul_node->get_input_shape(1); + auto output_shape = matmul_node->get_output_shape(0); + if 
(input1_shape.size() == 3 && input1_shape.front() == 1) { + input1_shape.erase(std::begin(input1_shape)); + } + + if (input1_shape.size() != 2 || input2_shape.size() != 2 || output_shape.size() < 2) { + return std::make_tuple(false, 0, 0, 0); + } + + // Check if MatMul or corresponding pointwise convolution are supported by GNA + const uint32_t width = input1_shape.front(); + const uint32_t in_channels = input2_shape.back(); + const uint32_t out_channels = input2_shape.front(); + if (input1_shape.front() <= GNALimitations::affineMaxBatchSize || + out_channels % GNALimitations::convFiltersNumDivider != 0 || + out_channels > GNALimitations::convMaxFiltersNum || + in_channels > GNALimitations::convFilterMaxSize) { + return std::make_tuple(false, 0, 0, 0); + } + + return std::make_tuple(true, width, in_channels, out_channels); +} + +static bool Convert(std::shared_ptr matmul_node, + std::shared_ptr add, + std::shared_ptr bias, + std::shared_ptr fq) { + bool supported; + uint32_t width, in_channels, out_channels; + std::tie(supported, width, in_channels, out_channels) = VerifyAndGetConvParams(matmul_node); + if (!supported) return false; + + auto input_node = matmul_node->input_value(0).get_node_shared_ptr(); + auto weights_node = matmul_node->input_value(1).get_node_shared_ptr(); + auto base_name = matmul_node->get_friendly_name(); + + auto reshape_const_before = std::make_shared(ngraph::element::Type_t::i64, + ngraph::Shape{4}, + ngraph::Shape{1, 1, width, in_channels}); + auto reshape_before = std::make_shared(input_node, reshape_const_before, false); + reshape_before->set_friendly_name(base_name + "/reshape_in"); + + auto transpose_before = std::make_shared(reshape_before, + ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape{4}, + GetPermuteOrder(InferenceEngine::Layout::NHWC, InferenceEngine::Layout::NCHW))); + transpose_before->set_friendly_name(base_name + "/transpose_in"); + + auto weights_reshape_const = std::make_shared(ngraph::element::Type_t::i64, + ngraph::Shape{4}, ngraph::Shape{out_channels, in_channels, 1, 1}); + auto weights_reshaped = std::make_shared(weights_node, weights_reshape_const, false); + + std::shared_ptr conv_node = std::make_shared(transpose_before, weights_reshaped, + ngraph::Strides{1, 1}, ngraph::CoordinateDiff{0, 0}, ngraph::CoordinateDiff{0, 0}, + ngraph::Strides{1, 1}, ngraph::op::PadType::VALID); + conv_node->set_friendly_name(base_name + "/conv"); + + std::shared_ptr root_node = matmul_node; + if (bias != nullptr) { + conv_node = std::make_shared(conv_node, bias); + root_node = add; + } + + if (fq != nullptr) { + conv_node = fq->clone_with_new_inputs({conv_node, fq->input_value(1), fq->input_value(2), + fq->input_value(3), fq->input_value(4)}); + root_node = fq; + } + + auto transpose_after = std::make_shared(conv_node, + ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape{4}, + GetPermuteOrder(InferenceEngine::Layout::NCHW, InferenceEngine::Layout::NHWC))); + transpose_after->set_friendly_name(base_name + "/transpose_out"); + + auto output_shape = matmul_node->get_output_shape(0); + output_shape[output_shape.size() - 1] = out_channels; + output_shape[output_shape.size() - 2] = width; + auto reshape_const_after = std::make_shared(ngraph::element::Type_t::i64, + ngraph::Shape{output_shape.size()}, + output_shape); + auto reshape_after = std::make_shared(transpose_after, reshape_const_after, false); + reshape_after->set_friendly_name(base_name); + + ngraph::replace_node(root_node, reshape_after); + return true; +} + 
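Note (illustrative, not part of the change itself): a minimal sketch of how the three ConvertMatmul*ToPointWiseConvolution passes defined below are expected to be driven, assuming they are registered through the standard ngraph::pass::Manager like the plugin's other matcher passes. The toy graph builder and its shapes are assumptions made only to keep the example self-contained; they are not taken from this patch.

// Sketch only: register the new GNA matcher passes and run them on a toy MatMul graph.
#include <memory>

#include <ngraph/function.hpp>
#include <ngraph/opsets/opset7.hpp>
#include <ngraph/pass/manager.hpp>

#include "transformations/convert_matmul_to_pointwise_convolution.hpp"

// Hypothetical helper: builds Parameter[16, 512] x Constant[512, 128] -> MatMul.
// The sizes are illustrative, not tuned to the GNA limits checked in VerifyAndGetConvParams.
static std::shared_ptr<ngraph::Function> MakeToyMatMulFunction() {
    auto input = std::make_shared<ngraph::opset7::Parameter>(ngraph::element::f32, ngraph::Shape{16, 512});
    auto weights = ngraph::opset7::Constant::create(ngraph::element::f32, ngraph::Shape{512, 128}, {1.f});
    auto matmul = std::make_shared<ngraph::opset7::MatMul>(input, weights);
    return std::make_shared<ngraph::Function>(ngraph::OutputVector{matmul}, ngraph::ParameterVector{input});
}

static void RunMatMulToPointWiseConvPasses() {
    auto func = MakeToyMatMulFunction();
    ngraph::pass::Manager manager;
    // Register the more specific patterns (with FakeQuantize / bias) ahead of the plain
    // MatMul pattern so they get a chance to match before the generic one rewrites the MatMul.
    manager.register_pass<GNAPluginNS::ConvertMatmulWithFqToPointWiseConvolution>();
    manager.register_pass<GNAPluginNS::ConvertMatmulWithBiasToPointWiseConvolution>();
    manager.register_pass<GNAPluginNS::ConvertMatmulToPointWiseConvolution>();
    manager.run_passes(func);
}

The GNA plugin wires these passes into its own transformation pipeline; the sketch only shows the registration pattern and the shared Convert() helper's entry point from the matcher callbacks above.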
+ConvertMatmulToPointWiseConvolution::ConvertMatmulToPointWiseConvolution() { + MATCHER_SCOPE(ConvertMatmulToPointWiseConvolution); + auto const_input = ngraph::pattern::wrap_type(); + auto const_fq = ngraph::pattern::wrap_type({const_input, + ngraph::pattern::wrap_type(), + ngraph::pattern::wrap_type(), + ngraph::pattern::wrap_type(), + ngraph::pattern::wrap_type()}); + auto second_input = std::make_shared(ngraph::OutputVector{const_input, const_fq}); + auto matmul = ngraph::pattern::wrap_type({ngraph::pattern::any_input(), second_input}); + + ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher &m) { + const auto& pattern_map = m.get_pattern_value_map(); + return Convert(pattern_map.at(matmul).get_node_shared_ptr(), nullptr, nullptr, nullptr); + }; + + auto m = std::make_shared(matmul, matcher_name); + this->register_matcher(m, callback); +} + +ConvertMatmulWithBiasToPointWiseConvolution::ConvertMatmulWithBiasToPointWiseConvolution() { + MATCHER_SCOPE(ConvertMatmulWithBiasToPointWiseConvolution); + auto const_input = ngraph::pattern::wrap_type(); + auto const_fq = ngraph::pattern::wrap_type({const_input, + ngraph::pattern::wrap_type(), + ngraph::pattern::wrap_type(), + ngraph::pattern::wrap_type(), + ngraph::pattern::wrap_type()}); + auto second_input = std::make_shared(ngraph::OutputVector{const_input, const_fq}); + auto matmul = ngraph::pattern::wrap_type({ngraph::pattern::any_input(), second_input}); + auto bias = ngraph::pattern::wrap_type(); + auto add = ngraph::pattern::wrap_type({matmul, bias}); + + ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher &m) { + const auto& pattern_map = m.get_pattern_value_map(); + return Convert(pattern_map.at(matmul).get_node_shared_ptr(), pattern_map.at(add).get_node_shared_ptr(), + pattern_map.at(bias).get_node_shared_ptr(), nullptr); + }; + + auto m = std::make_shared(add, matcher_name); + this->register_matcher(m, callback); +} + +ConvertMatmulWithFqToPointWiseConvolution::ConvertMatmulWithFqToPointWiseConvolution() { + MATCHER_SCOPE(ConvertMatmulWithFqToPointWiseConvolution); + auto const_input = ngraph::pattern::wrap_type(); + auto const_fq = ngraph::pattern::wrap_type({const_input, + ngraph::pattern::wrap_type(), + ngraph::pattern::wrap_type(), + ngraph::pattern::wrap_type(), + ngraph::pattern::wrap_type()}); + auto second_input = std::make_shared(ngraph::OutputVector{const_input, const_fq}); + auto matmul = ngraph::pattern::wrap_type({ngraph::pattern::any_input(), second_input}); + auto bias = ngraph::pattern::wrap_type(); + auto add = ngraph::pattern::wrap_type({matmul, bias}); + auto matmul_out = std::make_shared(ngraph::OutputVector{add, matmul}); + auto out_fq = ngraph::pattern::wrap_type({matmul_out, + ngraph::pattern::wrap_type(), + ngraph::pattern::wrap_type(), + ngraph::pattern::wrap_type(), + ngraph::pattern::wrap_type()}); + + ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher &m) { + const auto& pattern_map = m.get_pattern_value_map(); + auto add_it = pattern_map.find(add); + auto add_node = (add_it == std::end(pattern_map) ? nullptr : add_it->second.get_node_shared_ptr()); + auto bias_it = pattern_map.find(bias); + auto bias_node = (bias_it == std::end(pattern_map) ? 
nullptr : bias_it->second.get_node_shared_ptr()); + return Convert(pattern_map.at(matmul).get_node_shared_ptr(), add_node, bias_node, + pattern_map.at(out_fq).get_node_shared_ptr()); + }; + + auto m = std::make_shared(out_fq, matcher_name); + this->register_matcher(m, callback); +} \ No newline at end of file diff --git a/inference-engine/src/gna_plugin/transformations/convert_matmul_to_pointwise_convolution.hpp b/inference-engine/src/gna_plugin/transformations/convert_matmul_to_pointwise_convolution.hpp new file mode 100644 index 00000000000000..999b529194d860 --- /dev/null +++ b/inference-engine/src/gna_plugin/transformations/convert_matmul_to_pointwise_convolution.hpp @@ -0,0 +1,71 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace GNAPluginNS { + +/** + * @brief Convert a MatMul with batch size unsupported by GNA to a point-wise convolution with NHWC layout + * with transposes around it: + * Transose (NHWC -> NCHW) + * | + * Matmul Convolution in NHWC layout + * Input1: [A, B] B > 8 -------> Input: [1, 1, A, B] + * Input2: [B, C] Kernel: [C, B, 1, 1] + * Output: [A, C] Output: [1, 1, A, C] + * | + * Transose (NCHW -> NHWC) + */ +class ConvertMatmulToPointWiseConvolution : public ngraph::pass::MatcherPass { +public: + NGRAPH_RTTI_DECLARATION; + ConvertMatmulToPointWiseConvolution(); +}; + +/** + * @brief Convert a MatMul with batch size unsupported by GNA to a point-wise convolution with NHWC layout + * with transposes around it, moved add with bias before the last transpose: + * Transose (NHWC -> NCHW) + * | + * Matmul Convolution in NHWC layout + * Input1: [A, B] B > 8 -------> Input: [1, 1, A, B] + * Input2: [B, C] Kernel: [C, B, 1, 1] + * Output: [A, C] Output: [1, 1, A, C] + * | | + * Add (const) Add (const) + * | + * Transose (NCHW -> NHWC) + */ +class ConvertMatmulWithBiasToPointWiseConvolution : public ngraph::pass::MatcherPass { +public: + NGRAPH_RTTI_DECLARATION; + ConvertMatmulWithBiasToPointWiseConvolution(); +}; + +/** + * @brief Convert a MatMul with batch size unsupported by GNA to a point-wise convolution with NHWC layout + * with transposes around it, moved add with bias and/or fake quantize before the last transpose: + * Transose (NHWC -> NCHW) + * | + * Matmul Convolution in NHWC layout + * Input1: [A, B] B > 8 -------> Input: [1, 1, A, B] + * Input2: [B, C] Kernel: [C, B, 1, 1] + * Output: [A, C] Output: [1, 1, A, C] + * | | + * Add (const) Add (const) + * | | + * FakeQuantize FakeQuantize + * | + * Transose (NCHW -> NHWC) + */ +class ConvertMatmulWithFqToPointWiseConvolution : public ngraph::pass::MatcherPass { +public: + NGRAPH_RTTI_DECLARATION; + ConvertMatmulWithFqToPointWiseConvolution(); +}; + +} // namespace GNAPluginNS \ No newline at end of file diff --git a/inference-engine/src/gna_plugin/transformations/insert_transpose_after_convolution_or_pooling.cpp b/inference-engine/src/gna_plugin/transformations/insert_transpose_after_convolution_or_pooling.cpp index 6bfef2587aeff2..4954529762d582 100644 --- a/inference-engine/src/gna_plugin/transformations/insert_transpose_after_convolution_or_pooling.cpp +++ b/inference-engine/src/gna_plugin/transformations/insert_transpose_after_convolution_or_pooling.cpp @@ -1,6 +1,7 @@ // Copyright (C) 2021 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // +#include #include "transformations/insert_transpose_after_convolution_or_pooling.hpp" @@ -16,6 +17,7 @@ using namespace GNAPluginNS; NGRAPH_RTTI_DEFINITION(InsertTransposeAfterConvOrPool, 
"InsertTransposeAfterConvOrPool", 0); bool InsertTransposeAfterConvOrPool::run_on_function(std::shared_ptr f) { + RUN_ON_FUNCTION_SCOPE(InsertTransposeAfterConvOrPool); bool is_graph_modfied = false; for (auto& node : f->get_ordered_ops()) { if (std::dynamic_pointer_cast(node) == nullptr && diff --git a/inference-engine/src/gna_plugin/transformations/insert_transpose_before_matmul.cpp b/inference-engine/src/gna_plugin/transformations/insert_transpose_before_matmul.cpp index 4de8966d351660..3e5c579af8f14a 100644 --- a/inference-engine/src/gna_plugin/transformations/insert_transpose_before_matmul.cpp +++ b/inference-engine/src/gna_plugin/transformations/insert_transpose_before_matmul.cpp @@ -2,6 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 // +#include + #include "transformations/insert_transpose_before_matmul.hpp" #include @@ -13,6 +15,7 @@ using namespace GNAPluginNS; NGRAPH_RTTI_DEFINITION(InsertTransposeBeforeMatmul, "InsertTransposeBeforeMatmul", 0); InsertTransposeBeforeMatmul::InsertTransposeBeforeMatmul() { + MATCHER_SCOPE(InsertTransposeBeforeMatmul); auto reshape = ngraph::pattern::wrap_type({ngraph::pattern::any_input(), ngraph::pattern::any_input()}, ngraph::pattern::rank_equals(2)); @@ -59,6 +62,6 @@ InsertTransposeBeforeMatmul::InsertTransposeBeforeMatmul() { return true; }; - auto m = std::make_shared(root, "InsertTransposeBeforeMatmul"); + auto m = std::make_shared(root, matcher_name); this->register_matcher(m, callback); } diff --git a/inference-engine/src/gna_plugin/transformations/remove_extra_reshapes.cpp b/inference-engine/src/gna_plugin/transformations/remove_extra_reshapes.cpp index 1a7d6da2a33c8b..e1cfdefa31177c 100644 --- a/inference-engine/src/gna_plugin/transformations/remove_extra_reshapes.cpp +++ b/inference-engine/src/gna_plugin/transformations/remove_extra_reshapes.cpp @@ -2,6 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 // +#include + #include "transformations/remove_extra_reshapes.hpp" #include @@ -12,6 +14,7 @@ using namespace GNAPluginNS; NGRAPH_RTTI_DEFINITION(RemoveExtraReshapes, "RemoveExtraReshapes", 0); RemoveExtraReshapes::RemoveExtraReshapes() { + MATCHER_SCOPE(RemoveExtraReshapes); const auto reshape = ngraph::pattern::wrap_type(); const auto pooling = ngraph::pattern::wrap_type({reshape}); @@ -26,6 +29,6 @@ RemoveExtraReshapes::RemoveExtraReshapes() { return true; }; - auto m = std::make_shared(pooling, "RemoveExtraReshapes"); + auto m = std::make_shared(pooling, matcher_name); this->register_matcher(m, callback); } diff --git a/inference-engine/src/gna_plugin/transformations/reorder_activation_and_pooling.cpp b/inference-engine/src/gna_plugin/transformations/reorder_activation_and_pooling.cpp index 69bab295ba75e2..7e67d900e38423 100644 --- a/inference-engine/src/gna_plugin/transformations/reorder_activation_and_pooling.cpp +++ b/inference-engine/src/gna_plugin/transformations/reorder_activation_and_pooling.cpp @@ -2,6 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 // +#include + #include "transformations/reorder_activation_and_pooling.hpp" #include @@ -15,6 +17,7 @@ using namespace GNAPluginNS; NGRAPH_RTTI_DEFINITION(ReorderActivationAndPooling, "ReorderActivationAndPooling", 0); ReorderActivationAndPooling::ReorderActivationAndPooling() { + MATCHER_SCOPE(ReorderActivationAndPooling); auto conv = ngraph::pattern::wrap_type({ngraph::pattern::any_input(), ngraph::pattern::any_input()}); auto add = ngraph::pattern::wrap_type({conv, ngraph::pattern::any_input()}); @@ -63,6 +66,6 @@ ReorderActivationAndPooling::ReorderActivationAndPooling() { 
return true; }; - auto m = std::make_shared(pool, "ReorderActivationAndPooling"); + auto m = std::make_shared(pool, matcher_name); this->register_matcher(m, callback); } diff --git a/inference-engine/src/gna_plugin/transformations/split_convolution_with_large_buffer_size.cpp b/inference-engine/src/gna_plugin/transformations/split_convolution_with_large_buffer_size.cpp new file mode 100644 index 00000000000000..2e750308e5f3d1 --- /dev/null +++ b/inference-engine/src/gna_plugin/transformations/split_convolution_with_large_buffer_size.cpp @@ -0,0 +1,135 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include + +#include "transformations/split_convolution_with_large_buffer_size.hpp" + +#include + +#include +#include +#include + +#include "backend/gna_limitations.hpp" + +using namespace GNAPluginNS; + +NGRAPH_RTTI_DEFINITION(SplitConvolution, "SplitConvolution", 0); +NGRAPH_RTTI_DEFINITION(SplitConvolutionWithBias, "SplitConvolutionWithBias", 0); +NGRAPH_RTTI_DEFINITION(SplitConvolutionWithFq, "SplitConvolutionWithFq", 0); + +static std::vector GetConvSplitSizes(std::shared_ptr conv) { + uint32_t width = conv->get_input_shape(0).back(); + uint32_t in_channels = conv->get_input_shape(0).at(1); + uint32_t usedWidth = 0; + std::vector split_sizes; + uint32_t width_max_size = GNALimitations::bufferMaxSize / in_channels; + width_max_size = width_max_size - width_max_size % 64; + while (usedWidth < width) { + uint32_t width_part = std::min(width - usedWidth, width_max_size); + split_sizes.push_back(width_part); + usedWidth += width_part; + } + IE_ASSERT(usedWidth == width); + return split_sizes; +} + +static bool Convert(std::shared_ptr conv, + std::shared_ptr add, + std::shared_ptr bias, + std::shared_ptr fq) { + auto input_size = std::accumulate(std::begin(conv->get_input_shape(0)), + std::end(conv->get_input_shape(0)), 1, std::multiplies()); + if (input_size <= GNALimitations::bufferMaxSize) { + return false; + } + + auto split_sizes = GetConvSplitSizes(conv); + IE_ASSERT(split_sizes.size() > 1); + + /* TODO check if it's NHWC convolution wrapped with transposes or all input dimensions except of width == 1, + otherwise this split axis isn't supported */ + const int64_t width_axis = conv->get_input_shape(0).size() - 1; + auto split_node = std::make_shared(conv->input_value(0), + ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape({1}), std::vector{width_axis}), + ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape({split_sizes.size()}), split_sizes)); + split_node->set_friendly_name(conv->get_friendly_name() + "/split"); + ngraph::OutputVector convOutputs; + std::shared_ptr root_node = fq ? fq : (add ? 
add : conv); + for (int i = 0; i < split_sizes.size(); ++i) { + std::shared_ptr output = conv->clone_with_new_inputs({split_node->output(i), conv->input_value(1)}); + output->set_friendly_name(conv->get_friendly_name() + "_" + std::to_string(i)); + if (bias) { + output = std::make_shared(output, bias); + } + + if (fq) { + output = fq->clone_with_new_inputs({output, fq->input_value(1), fq->input_value(2), + fq->input_value(3), fq->input_value(4)}); + } + convOutputs.push_back(output); + } + + auto concat = std::make_shared(convOutputs, width_axis); + concat->set_friendly_name(conv->get_friendly_name()); + ngraph::replace_node(root_node, concat); + return true; +} + +SplitConvolution::SplitConvolution() { + MATCHER_SCOPE(SplitConvolution); + auto conv = ngraph::pattern::wrap_type({ngraph::pattern::any_input(), + ngraph::pattern::any_input()}); + + ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher &m) { + const auto& pattern_map = m.get_pattern_value_map(); + return Convert(pattern_map.at(conv).get_node_shared_ptr(), nullptr, nullptr, nullptr); + }; + + auto m = std::make_shared(conv, matcher_name); + this->register_matcher(m, callback); +} + +SplitConvolutionWithBias::SplitConvolutionWithBias() { + MATCHER_SCOPE(SplitConvolutionWithBias); + auto conv = ngraph::pattern::wrap_type({ngraph::pattern::any_input(), + ngraph::pattern::any_input()}); + auto bias = ngraph::pattern::wrap_type(); + auto add = ngraph::pattern::wrap_type({conv, bias}); + + ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher &m) { + const auto& pattern_map = m.get_pattern_value_map(); + return Convert(pattern_map.at(conv).get_node_shared_ptr(), pattern_map.at(add).get_node_shared_ptr(), + pattern_map.at(bias).get_node_shared_ptr(), nullptr); + }; + + auto m = std::make_shared(add, matcher_name); + this->register_matcher(m, callback); +} + +SplitConvolutionWithFq::SplitConvolutionWithFq() { + MATCHER_SCOPE(SplitConvolutionWithFq); + auto conv = ngraph::pattern::wrap_type({ngraph::pattern::any_input(), + ngraph::pattern::any_input()}); + auto bias = ngraph::pattern::wrap_type(); + auto add = ngraph::pattern::wrap_type({conv, bias}); + auto conv_output = std::make_shared(ngraph::OutputVector{conv, add}); + auto out_fq = ngraph::pattern::wrap_type({conv_output, + ngraph::pattern::wrap_type(), + ngraph::pattern::wrap_type(), + ngraph::pattern::wrap_type(), + ngraph::pattern::wrap_type()}); + + ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher &m) { + const auto& pattern_map = m.get_pattern_value_map(); + auto add_it = pattern_map.find(add); + auto add_node = (add_it == std::end(pattern_map) ? nullptr : add_it->second.get_node_shared_ptr()); + auto bias_it = pattern_map.find(bias); + auto bias_node = (bias_it == std::end(pattern_map) ? 
nullptr : bias_it->second.get_node_shared_ptr()); + return Convert(pattern_map.at(conv).get_node_shared_ptr(), add_node, bias_node, pattern_map.at(out_fq).get_node_shared_ptr()); + }; + + auto m = std::make_shared(out_fq, matcher_name); + this->register_matcher(m, callback); +} \ No newline at end of file diff --git a/inference-engine/src/gna_plugin/transformations/split_convolution_with_large_buffer_size.hpp b/inference-engine/src/gna_plugin/transformations/split_convolution_with_large_buffer_size.hpp new file mode 100644 index 00000000000000..8667f4273bfaad --- /dev/null +++ b/inference-engine/src/gna_plugin/transformations/split_convolution_with_large_buffer_size.hpp @@ -0,0 +1,34 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace GNAPluginNS { + +// @brief Splits convolution with large input buffer +class SplitConvolution : public ngraph::pass::MatcherPass { +public: + NGRAPH_RTTI_DECLARATION; + SplitConvolution(); +}; + +// @brief Splits convolution with large input buffer, move add with bias to each convolution before concat +class SplitConvolutionWithBias : public ngraph::pass::MatcherPass { +public: + NGRAPH_RTTI_DECLARATION; + SplitConvolutionWithBias(); +}; + +/* @brief Splits convolution with large input buffer, + * move add with bias and/or fake quantize to each convolution before concat + */ +class SplitConvolutionWithFq : public ngraph::pass::MatcherPass { +public: + NGRAPH_RTTI_DECLARATION; + SplitConvolutionWithFq(); +}; + +} // namespace GNAPluginNS \ No newline at end of file diff --git a/inference-engine/src/gna_plugin/transformations/swap_input_matmul_gna.cpp b/inference-engine/src/gna_plugin/transformations/swap_input_matmul_gna.cpp index 9a725c33cf7151..d177b83ba40e60 100644 --- a/inference-engine/src/gna_plugin/transformations/swap_input_matmul_gna.cpp +++ b/inference-engine/src/gna_plugin/transformations/swap_input_matmul_gna.cpp @@ -2,6 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 // +#include + #include #include @@ -19,6 +21,7 @@ using namespace GNAPluginNS; NGRAPH_RTTI_DEFINITION(SwapInputMatMul, "SwapInputMatMul", 0); SwapInputMatMul::SwapInputMatMul() { + MATCHER_SCOPE(SwapInputMatMul); auto matmul = ngraph::pattern::wrap_type({ngraph::pattern::any_input( ngraph::pattern::has_static_shape()), ngraph::pattern::any_input(ngraph::pattern::has_static_shape())}, ngraph::pattern::has_static_shape()); @@ -95,6 +98,6 @@ SwapInputMatMul::SwapInputMatMul() { return true; }; - auto m = std::make_shared(matmul, "SwapInputMatMul"); + auto m = std::make_shared(matmul, matcher_name); this->register_matcher(m, callback); } \ No newline at end of file diff --git a/inference-engine/src/hetero_plugin/hetero_executable_network.cpp b/inference-engine/src/hetero_plugin/hetero_executable_network.cpp index 58fb35111affab..994ba866f7a2e9 100644 --- a/inference-engine/src/hetero_plugin/hetero_executable_network.cpp +++ b/inference-engine/src/hetero_plugin/hetero_executable_network.cpp @@ -312,6 +312,7 @@ HeteroExecutableNetwork::HeteroExecutableNetwork(const InferenceEngine::CNNNetwo struct Subgraph { ngraph::ResultVector _results; ngraph::ParameterVector _parameters; + ngraph::SinkVector _sinks; std::string _affinity; }; std::unordered_map subgraphs; @@ -325,6 +326,9 @@ HeteroExecutableNetwork::HeteroExecutableNetwork(const InferenceEngine::CNNNetwo } else if (ngraph::op::is_parameter(node)) { subgraph._parameters.emplace_back( std::dynamic_pointer_cast(node->shared_from_this())); + } else if 
(ngraph::op::is_sink(node)) { + subgraph._sinks.emplace_back( + std::dynamic_pointer_cast(node->shared_from_this())); } auto itAffinity = affinities.find(node); if (itAffinity != affinities.end()) { @@ -373,7 +377,7 @@ HeteroExecutableNetwork::HeteroExecutableNetwork(const InferenceEngine::CNNNetwo for (auto&& subgraph : orderedSubgraphs) { _networks[id]._device = subgraph._affinity; subFunctions[id] = - std::make_shared(subgraph._results, subgraph._parameters, + std::make_shared(subgraph._results, subgraph._sinks, subgraph._parameters, _name + '_' + std::to_string(id)); _networks[id]._clonedNetwork = CNNNetwork{subFunctions[id]}; // update of pre-processing info @@ -550,7 +554,7 @@ HeteroExecutableNetwork::HeteroExecutableNetwork(std::istream& this->SetPointerToPlugin(_heteroPlugin->shared_from_this()); } -void HeteroExecutableNetwork::ExportImpl(std::ostream& heteroModel) { +void HeteroExecutableNetwork::Export(std::ostream& heteroModel) { pugi::xml_document doc; auto heteroNode = doc.append_child("hetero"); heteroNode.append_attribute("name").set_value(_name.c_str()); diff --git a/inference-engine/src/hetero_plugin/hetero_executable_network.hpp b/inference-engine/src/hetero_plugin/hetero_executable_network.hpp index 85fc8d9c19c9cd..59574ca2ce7a5f 100644 --- a/inference-engine/src/hetero_plugin/hetero_executable_network.hpp +++ b/inference-engine/src/hetero_plugin/hetero_executable_network.hpp @@ -56,7 +56,7 @@ class HeteroExecutableNetwork : public InferenceEngine::ExecutableNetworkThreadS InferenceEngine::Parameter GetMetric(const std::string &name) const override; - void ExportImpl(std::ostream& modelFile) override; + void Export(std::ostream& modelFile) override; private: void InitCNNImpl(const InferenceEngine::CNNNetwork& network); diff --git a/inference-engine/src/hetero_plugin/hetero_plugin.cpp b/inference-engine/src/hetero_plugin/hetero_plugin.cpp index 1d8647716af8bf..09986b1e48eca8 100644 --- a/inference-engine/src/hetero_plugin/hetero_plugin.cpp +++ b/inference-engine/src/hetero_plugin/hetero_plugin.cpp @@ -57,13 +57,8 @@ InferenceEngine::IExecutableNetworkInternal::Ptr Engine::LoadExeNetworkImpl(cons return std::make_shared(network, mergeConfigs(_config, config), this); } -InferenceEngine::IExecutableNetworkInternal::Ptr Engine::ImportNetworkImpl(std::istream& heteroModel, const Configs& config) { - if (GetCore() == nullptr) { - IE_THROW() << "Please, work with HETERO device via InferencEngine::Core object"; - } - - return std::make_shared(heteroModel, - mergeConfigs(_config, config), this); +InferenceEngine::IExecutableNetworkInternal::Ptr Engine::ImportNetwork(std::istream& heteroModel, const std::map& config) { + return std::make_shared(heteroModel, mergeConfigs(_config, config), this); } Engine::Configs Engine::GetSupportedConfig(const Engine::Configs& config, const std::string & deviceName) const { diff --git a/inference-engine/src/hetero_plugin/hetero_plugin.hpp b/inference-engine/src/hetero_plugin/hetero_plugin.hpp index 2b5a93b829b254..fbc602116d109e 100644 --- a/inference-engine/src/hetero_plugin/hetero_plugin.hpp +++ b/inference-engine/src/hetero_plugin/hetero_plugin.hpp @@ -37,10 +37,11 @@ class Engine : public InferenceEngine::IInferencePlugin { InferenceEngine::Parameter GetConfig(const std::string& name, const std::map & options) const override; - InferenceEngine::IExecutableNetworkInternal::Ptr ImportNetworkImpl(std::istream& heteroModel, const Configs& config) override; + InferenceEngine::IExecutableNetworkInternal::Ptr + ImportNetwork(std::istream& 
heteroModel, const std::map& config) override; DeviceMetaInformationMap GetDevicePlugins(const std::string& targetFallback, - const Configs & localConfig) const; + const Configs & localConfig) const; private: Configs GetSupportedConfig(const Configs& config, const std::string & deviceName) const; diff --git a/inference-engine/src/inference_engine/CMakeLists.txt b/inference-engine/src/inference_engine/CMakeLists.txt index e8ed1a5c4c38f4..4a5577e08328f3 100644 --- a/inference-engine/src/inference_engine/CMakeLists.txt +++ b/inference-engine/src/inference_engine/CMakeLists.txt @@ -219,7 +219,7 @@ export(TARGETS ${TARGET_NAME} NAMESPACE IE:: # Export for developer package -ie_developer_export_targets(${TARGET_NAME}_plugin_api) +openvino_developer_export_targets(COMPONENT inference_engine TARGETS ${TARGET_NAME}_plugin_api) # install TBB diff --git a/inference-engine/src/inference_engine/compilation_context.cpp b/inference-engine/src/inference_engine/compilation_context.cpp index ae6aa698eafa48..1b656200996657 100644 --- a/inference-engine/src/inference_engine/compilation_context.cpp +++ b/inference-engine/src/inference_engine/compilation_context.cpp @@ -42,7 +42,7 @@ static int32_t as_int32_t(T v) { } class OstreamHashWrapper final: public std::streambuf { - std::size_t m_res = {}; + std::size_t m_res = 0; public: std::size_t getResult() const { return m_res; } std::streamsize xsputn(const char* s, std::streamsize n) override { @@ -65,7 +65,7 @@ class OstreamHashWrapper final: public std::streambuf { ////////////////////////////////////////////////// std::string NetworkCompilationContext::calculateFileInfo(const std::string& filePath) { - size_t seed {}; + size_t seed = 0; auto absPath = filePath; try { absPath = FileUtils::absoluteFilePath(filePath); diff --git a/inference-engine/src/inference_engine/cpp/ie_executable_network.cpp b/inference-engine/src/inference_engine/cpp/ie_executable_network.cpp index 6de781d11e62ec..a4afee5a28ba2b 100644 --- a/inference-engine/src/inference_engine/cpp/ie_executable_network.cpp +++ b/inference-engine/src/inference_engine/cpp/ie_executable_network.cpp @@ -25,47 +25,15 @@ ExecutableNetwork::ExecutableNetwork(const details::SharedObjectLoader& so, IE_SUPPRESS_DEPRECATED_START -ExecutableNetwork::ExecutableNetwork(IExecutableNetwork::Ptr exec, - std::shared_ptr splg) - : _so(), _impl(), actual(exec) { - if (splg) { - _so = *splg; - } - - // plg can be null, but not the actual - if (actual == nullptr) - IE_THROW(NotAllocated) << "ExecutableNetwork was not initialized."; -} - ConstOutputsDataMap ExecutableNetwork::GetOutputsInfo() const { - if (actual) { - ConstOutputsDataMap data; - CALL_STATUS_FNC(GetOutputsInfo, data); - return data; - } - EXEC_NET_CALL_STATEMENT(return _impl->GetOutputsInfo()); } ConstInputsDataMap ExecutableNetwork::GetInputsInfo() const { - if (actual) { - ConstInputsDataMap info; - CALL_STATUS_FNC(GetInputsInfo, info); - return info; - } - EXEC_NET_CALL_STATEMENT(return _impl->GetInputsInfo()); } void ExecutableNetwork::reset(IExecutableNetwork::Ptr newActual) { - if (actual) { - if (newActual == nullptr) { - THROW_IE_EXCEPTION << "ExecutableNetwork wrapper used for reset was not initialized."; - } - this->actual.swap(newActual); - return; - } - if (_impl == nullptr) IE_THROW() << "ExecutableNetwork was not initialized."; if (newActual == nullptr) IE_THROW() << "ExecutableNetwork wrapper used for reset was not initialized."; auto newBase = std::dynamic_pointer_cast(newActual); @@ -76,36 +44,10 @@ void 
ExecutableNetwork::reset(IExecutableNetwork::Ptr newActual) { } ExecutableNetwork::operator IExecutableNetwork::Ptr() { - if (actual) { - return actual; - } - return std::make_shared(_impl); } std::vector ExecutableNetwork::QueryState() { - if (actual) { - if (actual == nullptr) THROW_IE_EXCEPTION << "ExecutableNetwork was not initialized."; - IVariableState::Ptr pState = nullptr; - auto res = OK; - std::vector controller; - for (size_t idx = 0; res == OK; ++idx) { - ResponseDesc resp; - IE_SUPPRESS_DEPRECATED_START - res = actual->QueryState(pState, idx, &resp); - IE_SUPPRESS_DEPRECATED_END - if (res != OK && res != OUT_OF_BOUNDS) { - THROW_IE_EXCEPTION << resp.msg; - } - if (res != OUT_OF_BOUNDS) { - controller.push_back(VariableState(pState, - std::make_shared(_so))); - } - } - - return controller; - } - std::vector controller; EXEC_NET_CALL_STATEMENT( for (auto&& state : _impl->QueryState()) { @@ -115,13 +57,6 @@ std::vector ExecutableNetwork::QueryState() { } InferRequest ExecutableNetwork::CreateInferRequest() { - if (actual) { - IInferRequest::Ptr req; - CALL_STATUS_FNC(CreateInferRequest, req); - if (req.get() == nullptr) THROW_IE_EXCEPTION << "Internal error: pointer to infer request is null"; - return InferRequest(req, std::make_shared(_so)); - } - EXEC_NET_CALL_STATEMENT(return {_so, _impl->CreateInferRequest()}); } @@ -130,72 +65,38 @@ InferRequest::Ptr ExecutableNetwork::CreateInferRequestPtr() { } void ExecutableNetwork::Export(const std::string& modelFileName) { - if (actual) { - CALL_STATUS_FNC(Export, modelFileName); - return; - } EXEC_NET_CALL_STATEMENT(_impl->Export(modelFileName)); } void ExecutableNetwork::Export(std::ostream& networkModel) { - if (actual) { - CALL_STATUS_FNC(Export, networkModel); - return; - } EXEC_NET_CALL_STATEMENT(_impl->Export(networkModel)); } CNNNetwork ExecutableNetwork::GetExecGraphInfo() { - if (actual) { - IE_SUPPRESS_DEPRECATED_START - ICNNNetwork::Ptr ptr = nullptr; - CALL_STATUS_FNC(GetExecGraphInfo, ptr); - return CNNNetwork(ptr); - IE_SUPPRESS_DEPRECATED_END - } EXEC_NET_CALL_STATEMENT(return _impl->GetExecGraphInfo()); } void ExecutableNetwork::SetConfig(const std::map& config) { - if (actual) { - CALL_STATUS_FNC(SetConfig, config); - return; - } EXEC_NET_CALL_STATEMENT(_impl->SetConfig(config)); } Parameter ExecutableNetwork::GetConfig(const std::string& name) const { - if (actual) { - Parameter configValue; - CALL_STATUS_FNC(GetConfig, name, configValue); - return configValue; - } EXEC_NET_CALL_STATEMENT(return _impl->GetConfig(name)); } Parameter ExecutableNetwork::GetMetric(const std::string& name) const { - if (actual) { - Parameter metricValue; - CALL_STATUS_FNC(GetMetric, name, metricValue); - return metricValue; - } EXEC_NET_CALL_STATEMENT(return _impl->GetMetric(name)); } RemoteContext::Ptr ExecutableNetwork::GetContext() const { - if (actual) { - RemoteContext::Ptr pContext; - CALL_STATUS_FNC(GetContext, pContext); - return pContext; - } EXEC_NET_CALL_STATEMENT(return _impl->GetContext()); } bool ExecutableNetwork::operator!() const noexcept { - return !_impl || !actual; + return !_impl; } ExecutableNetwork::operator bool() const noexcept { - return !!_impl || !!actual; + return !!_impl; } } // namespace InferenceEngine diff --git a/inference-engine/src/inference_engine/cpp/ie_executable_network_base.hpp b/inference-engine/src/inference_engine/cpp/ie_executable_network_base.hpp index 2f813c0b783560..c87b1fc7098a2c 100644 --- a/inference-engine/src/inference_engine/cpp/ie_executable_network_base.hpp +++ 
b/inference-engine/src/inference_engine/cpp/ie_executable_network_base.hpp @@ -18,7 +18,6 @@ #include #include #include "cpp/exception2status.hpp" -#include "ie_variable_state_base.hpp" #include "ie_infer_async_request_base.hpp" namespace InferenceEngine { @@ -64,29 +63,10 @@ class ExecutableNetworkBase : public IExecutableNetwork { TO_STATUS(_impl->Export(networkModel)); } - IE_SUPPRESS_DEPRECATED_START StatusCode GetExecGraphInfo(ICNNNetwork::Ptr& graphPtr, ResponseDesc* resp) noexcept override { - // should be refactored together with ExecutableNetwork interface TO_STATUS(graphPtr = _impl->GetExecGraphInfo()); } - INFERENCE_ENGINE_DEPRECATED("Use InferRequest::QueryState instead") - StatusCode QueryState(IVariableState::Ptr& pState, size_t idx, ResponseDesc* resp) noexcept override { - try { - auto v = _impl->QueryState(); - if (idx >= v.size()) { - return OUT_OF_BOUNDS; - } - pState = std::make_shared(v[idx]); - return OK; - } catch (const std::exception& ex) { - return InferenceEngine::DescriptionBuffer(GENERAL_ERROR, resp) << ex.what(); - } catch (...) { - return InferenceEngine::DescriptionBuffer(UNEXPECTED); - } - } - IE_SUPPRESS_DEPRECATED_END - StatusCode SetConfig(const std::map& config, ResponseDesc* resp) noexcept override { TO_STATUS(_impl->SetConfig(config)); } diff --git a/inference-engine/src/inference_engine/cpp/ie_infer_async_request_base.hpp b/inference-engine/src/inference_engine/cpp/ie_infer_async_request_base.hpp index 1253947eeafd15..6ede78f720e3a2 100644 --- a/inference-engine/src/inference_engine/cpp/ie_infer_async_request_base.hpp +++ b/inference-engine/src/inference_engine/cpp/ie_infer_async_request_base.hpp @@ -10,10 +10,10 @@ #include "cpp/exception2status.hpp" #include "cpp_interfaces/plugin_itt.hpp" -#include "ie_variable_state_base.hpp" #include #include "ie_iinfer_request.hpp" #include "ie_preprocess.hpp" + namespace InferenceEngine { #define CATCH_IE_EXCEPTION_TO_STATUS_NO_RESP(StatusCode, ExceptionType) catch (const ExceptionType& ex) { \ @@ -169,23 +169,6 @@ class InferRequestBase : public IInferRequest { StatusCode SetBatch(int batch_size, ResponseDesc* resp) noexcept override { TO_STATUS(_impl->SetBatch(batch_size)); } - - IE_SUPPRESS_DEPRECATED_START - StatusCode QueryState(IVariableState::Ptr& pState, size_t idx, ResponseDesc* resp) noexcept override { - try { - auto v = _impl->QueryState(); - if (idx >= v.size()) { - return OUT_OF_BOUNDS; - } - pState = std::make_shared(v[idx]); - return OK; - } catch (const std::exception& ex) { - return InferenceEngine::DescriptionBuffer(GENERAL_ERROR, resp) << ex.what(); - } catch (...) 
{ - return InferenceEngine::DescriptionBuffer(UNEXPECTED); - } - } - IE_SUPPRESS_DEPRECATED_END }; IE_SUPPRESS_DEPRECATED_END diff --git a/inference-engine/src/inference_engine/cpp/ie_infer_request.cpp b/inference-engine/src/inference_engine/cpp/ie_infer_request.cpp index 97fba9af7f924f..9e68666b7a36f6 100644 --- a/inference-engine/src/inference_engine/cpp/ie_infer_request.cpp +++ b/inference-engine/src/inference_engine/cpp/ie_infer_request.cpp @@ -23,44 +23,17 @@ namespace InferenceEngine { InferRequest::InferRequest(const details::SharedObjectLoader& so, const IInferRequestInternal::Ptr& impl) - : _so(so), _impl(impl), actual() { + : _so(so), _impl(impl) { IE_ASSERT(_impl != nullptr); } IE_SUPPRESS_DEPRECATED_START -InferRequest::InferRequest(IInferRequest::Ptr request, - std::shared_ptr splg) - : _so(), _impl(), actual(request) { - if (splg) { - _so = *splg; - } - - // plg can be null, but not the actual - if (actual == nullptr) - IE_THROW(NotAllocated) << "InferRequest was not initialized."; -} - void InferRequest::SetBlob(const std::string& name, const Blob::Ptr& data) { - if (actual) { - CALL_STATUS_FNC(SetBlob, name.c_str(), data); - return; - } INFER_REQ_CALL_STATEMENT(_impl->SetBlob(name, data);) } Blob::Ptr InferRequest::GetBlob(const std::string& name) { - if (actual) { - Blob::Ptr data; - CALL_STATUS_FNC(GetBlob, name.c_str(), data); - std::string error = "Internal error: blob with name `" + name + "` is not allocated!"; - auto blobPtr = data.get(); - const bool remoteBlobPassed = blobPtr->is(); - if (blobPtr == nullptr) IE_THROW() << error; - if (!remoteBlobPassed && blobPtr->buffer() == nullptr) IE_THROW() << error; - return data; - } - Blob::Ptr blobPtr; INFER_REQ_CALL_STATEMENT(blobPtr = _impl->GetBlob(name);) std::string error = "Internal error: blob with name `" + name + "` is not allocated!"; @@ -71,60 +44,26 @@ Blob::Ptr InferRequest::GetBlob(const std::string& name) { } void InferRequest::SetBlob(const std::string &name, const Blob::Ptr &data, const PreProcessInfo& info) { - if (actual) { - CALL_STATUS_FNC(SetBlob, name.c_str(), data, info); - return; - } - INFER_REQ_CALL_STATEMENT(_impl->SetBlob(name, data, info);) } const PreProcessInfo& InferRequest::GetPreProcess(const std::string& name) const { - if (actual) { - const PreProcessInfo* info = nullptr; - CALL_STATUS_FNC(GetPreProcess, name.c_str(), &info); - return *info; - } - INFER_REQ_CALL_STATEMENT(return _impl->GetPreProcess(name);) } void InferRequest::Infer() { - if (actual) { - CALL_STATUS_FNC_NO_ARGS(Infer); - return; - } - INFER_REQ_CALL_STATEMENT(_impl->Infer();) } void InferRequest::Cancel() { - if (actual) { - CALL_STATUS_FNC_NO_ARGS(Cancel); - return; - } - INFER_REQ_CALL_STATEMENT(_impl->Cancel();) } std::map InferRequest::GetPerformanceCounts() const { - if (actual) { - std::map perfMap; - CALL_STATUS_FNC(GetPerformanceCounts, perfMap); - return perfMap; - } - INFER_REQ_CALL_STATEMENT(return _impl->GetPerformanceCounts();) } void InferRequest::SetInput(const BlobMap& inputs) { - if (actual) { - for (auto&& input : inputs) { - CALL_STATUS_FNC(SetBlob, input.first.c_str(), input.second); - } - return; - } - INFER_REQ_CALL_STATEMENT( for (auto&& input : inputs) { _impl->SetBlob(input.first, input.second); @@ -133,13 +72,6 @@ void InferRequest::SetInput(const BlobMap& inputs) { } void InferRequest::SetOutput(const BlobMap& results) { - if (actual) { - for (auto&& result : results) { - CALL_STATUS_FNC(SetBlob, result.first.c_str(), result.second); - } - return; - } - INFER_REQ_CALL_STATEMENT( for (auto&& 
result : results) { _impl->SetBlob(result.first, result.second); @@ -148,106 +80,19 @@ void InferRequest::SetOutput(const BlobMap& results) { } void InferRequest::SetBatch(const int batch) { - if (actual) { - CALL_STATUS_FNC(SetBatch, batch); - return; - } - INFER_REQ_CALL_STATEMENT(_impl->SetBatch(batch);) } void InferRequest::StartAsync() { - if (actual) { - CALL_STATUS_FNC_NO_ARGS(StartAsync); - return; - } - INFER_REQ_CALL_STATEMENT(_impl->StartAsync();) } StatusCode InferRequest::Wait(int64_t millis_timeout) { - if (actual) { - ResponseDesc resp; - if (actual == nullptr) IE_THROW() << "InferRequest was not initialized."; - auto res = actual->Wait(millis_timeout, &resp); - if (res != OK && res != RESULT_NOT_READY && - res != INFER_NOT_STARTED && res != INFER_CANCELLED) { - IE_EXCEPTION_SWITCH(res, ExceptionType, - InferenceEngine::details::ThrowNow{} - <<= std::stringstream{} << IE_LOCATION << resp.msg) - } - return res; - } - INFER_REQ_CALL_STATEMENT(return _impl->Wait(millis_timeout);) } -namespace details { - -class ICompletionCallbackWrapper { -public: - virtual ~ICompletionCallbackWrapper() = default; - - virtual void call(InferenceEngine::IInferRequest::Ptr request, InferenceEngine::StatusCode code) const noexcept = 0; -}; - -template -class CompletionCallbackWrapper : public ICompletionCallbackWrapper { - T lambda; - -public: - explicit CompletionCallbackWrapper(const T& lambda): lambda(lambda) {} - - void call(InferenceEngine::IInferRequest::Ptr /*request*/, InferenceEngine::StatusCode /*code*/) const - noexcept override { - lambda(); - } -}; - -template <> -class CompletionCallbackWrapper : public ICompletionCallbackWrapper { - IInferRequest::CompletionCallback callBack; - -public: - explicit CompletionCallbackWrapper(const IInferRequest::CompletionCallback& callBack): callBack(callBack) {} - - void call(InferenceEngine::IInferRequest::Ptr request, InferenceEngine::StatusCode code) const noexcept override { - callBack(request, code); - } -}; - -template <> -class CompletionCallbackWrapper> : public ICompletionCallbackWrapper { - std::function lambda; - -public: - explicit CompletionCallbackWrapper(const std::function& lambda) - : lambda(lambda) {} - - void call(InferenceEngine::IInferRequest::Ptr request, InferenceEngine::StatusCode code) const noexcept override { - lambda(InferRequest(request), code); - } -}; - -void callWrapper(InferenceEngine::IInferRequest::Ptr request, InferenceEngine::StatusCode code) { - details::ICompletionCallbackWrapper* pWrapper = nullptr; - ResponseDesc dsc; - request->GetUserData(reinterpret_cast(&pWrapper), &dsc); - pWrapper->call(request, code); -} - -} // namespace details - void InferRequest::SetCompletionCallbackImpl(std::function callbackToSet) { - if (actual) { - using T = std::function; - callback.reset(new details::CompletionCallbackWrapper(callbackToSet)); - CALL_STATUS_FNC(SetUserData, callback.get()); - actual->SetCompletionCallback(InferenceEngine::details::callWrapper); - return; - } - INFER_REQ_CALL_STATEMENT( _impl->SetCallback([callbackToSet] (std::exception_ptr) { callbackToSet(); @@ -274,14 +119,6 @@ void InferRequest::SetCompletionCallbackImpl(std::function callbackToSet void InferRequest::SetCompletionCallbackImpl(std::function callbackToSet) { - if (actual) { - using T = std::function; - callback.reset(new details::CompletionCallbackWrapper(callbackToSet)); - CALL_STATUS_FNC(SetUserData, callback.get()); - actual->SetCompletionCallback(InferenceEngine::details::callWrapper); - return; - } - INFER_REQ_CALL_STATEMENT( auto 
weakThis = InferRequest{_so, std::shared_ptr{_impl.get(), [](IInferRequestInternal*){}}}; _impl->SetCallback([callbackToSet, weakThis] (std::exception_ptr exceptionPtr) { @@ -303,14 +140,6 @@ void InferRequest::SetCompletionCallbackImpl(std::function(callbackToSet)); - CALL_STATUS_FNC(SetUserData, callback.get()); - actual->SetCompletionCallback(InferenceEngine::details::callWrapper); - return; - } - INFER_REQ_CALL_STATEMENT( IInferRequest::Ptr weakThis = InferRequest{_so, std::shared_ptr{_impl.get(), [](IInferRequestInternal*){}}}; _impl->SetCallback([callbackToSet, weakThis] (std::exception_ptr exceptionPtr) { @@ -332,38 +161,12 @@ void InferRequest::SetCompletionCallbackImpl(IInferRequest::CompletionCallback c } InferRequest::operator IInferRequest::Ptr () { - if (actual) { - return actual; - } - INFER_REQ_CALL_STATEMENT( return std::make_shared(_impl); ) } std::vector InferRequest::QueryState() { - if (actual) { - IE_SUPPRESS_DEPRECATED_START - if (actual == nullptr) IE_THROW() << "ExecutableNetwork was not initialized."; - IVariableState::Ptr pState = nullptr; - auto res = OK; - std::vector controller; - for (size_t idx = 0; res == OK; ++idx) { - ResponseDesc resp; - res = actual->QueryState(pState, idx, &resp); - if (res != OK && res != OUT_OF_BOUNDS) { - IE_THROW() << resp.msg; - } - if (res != OUT_OF_BOUNDS) { - controller.push_back(VariableState(pState, - std::make_shared(_so))); - } - } - IE_SUPPRESS_DEPRECATED_END - - return controller; - } - std::vector controller; INFER_REQ_CALL_STATEMENT( for (auto&& state : _impl->QueryState()) { @@ -374,11 +177,11 @@ std::vector InferRequest::QueryState() { } bool InferRequest::operator!() const noexcept { - return !_impl || !actual; + return !_impl; } InferRequest::operator bool() const noexcept { - return (!!_impl) || (!!actual); + return (!!_impl); } bool InferRequest::operator!=(const InferRequest& r) const noexcept { @@ -386,7 +189,7 @@ bool InferRequest::operator!=(const InferRequest& r) const noexcept { } bool InferRequest::operator==(const InferRequest& r) const noexcept { - return r._impl == _impl && r.actual == actual; + return r._impl == _impl; } } // namespace InferenceEngine diff --git a/inference-engine/src/inference_engine/cpp/ie_variable_state.cpp b/inference-engine/src/inference_engine/cpp/ie_variable_state.cpp index 46f99d3fc6c1d0..63f7305e8b2b38 100644 --- a/inference-engine/src/inference_engine/cpp/ie_variable_state.cpp +++ b/inference-engine/src/inference_engine/cpp/ie_variable_state.cpp @@ -4,7 +4,6 @@ #include "details/ie_so_loader.h" #include "cpp/ie_memory_state.hpp" -#include "ie_imemory_state.hpp" #include "cpp_interfaces/interface/ie_ivariable_state_internal.hpp" #include "exception2status.hpp" @@ -24,57 +23,19 @@ VariableState::VariableState(const details::SharedObjectLoader& so, IE_SUPPRESS_DEPRECATED_START -VariableState::VariableState(std::shared_ptr state, - std::shared_ptr splg) - : _so(), _impl(), actual(state) { - if (splg) { - _so = *splg; - } - - // plg can be null, but not the actual - if (actual == nullptr) - IE_THROW(NotAllocated) << "VariableState was not initialized."; -} - -Blob::CPtr VariableState::GetLastState() const { - return GetState(); -} - void VariableState::Reset() { - if (actual) { - CALL_STATUS_FNC_NO_ARGS(Reset); - return; - } - VARIABLE_CALL_STATEMENT(_impl->Reset()); } std::string VariableState::GetName() const { - if (actual) { - char name[256]; - CALL_STATUS_FNC(GetName, name, sizeof(name)); - return name; - } - VARIABLE_CALL_STATEMENT(return _impl->GetName()); } Blob::CPtr 
VariableState::GetState() const { - if (actual) { - Blob::CPtr stateBlob; - CALL_STATUS_FNC(GetState, stateBlob); - return stateBlob; - } - VARIABLE_CALL_STATEMENT(return _impl->GetState()); } void VariableState::SetState(Blob::Ptr state) { - if (actual) { - CALL_STATUS_FNC(SetState, state); - return; - } - VARIABLE_CALL_STATEMENT(_impl->SetState(state)); } diff --git a/inference-engine/src/inference_engine/cpp/ie_variable_state_base.hpp b/inference-engine/src/inference_engine/cpp/ie_variable_state_base.hpp deleted file mode 100644 index 2481ca67852ddb..00000000000000 --- a/inference-engine/src/inference_engine/cpp/ie_variable_state_base.hpp +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include - -#include "cpp/exception2status.hpp" -#include "cpp_interfaces/interface/ie_ivariable_state_internal.hpp" -#include "ie_imemory_state.hpp" - -namespace InferenceEngine { - -IE_SUPPRESS_DEPRECATED_START - -/** - * @brief Default implementation for IVariableState - * @ingroup ie_dev_api_variable_state_api - */ -class VariableStateBase : public IVariableState { - std::shared_ptr impl; - -public: - /** - * @brief Constructor with actual underlying implementation. - * @param impl Underlying implementation of type IVariableStateInternal - */ - explicit VariableStateBase(std::shared_ptr impl): impl(impl) { - if (impl == nullptr) { - IE_THROW() << "VariableStateBase implementation is not defined"; - } - } - - StatusCode GetName(char* name, size_t len, ResponseDesc* resp) const noexcept override { - for (size_t i = 0; i != len; i++) { - name[i] = 0; - } - DescriptionBuffer buf(name, len); - TO_STATUS(buf << impl->GetName()); - return OK; - } - - StatusCode Reset(ResponseDesc* resp) noexcept override { - TO_STATUS(impl->Reset()); - } - - StatusCode SetState(Blob::Ptr newState, ResponseDesc* resp) noexcept override { - TO_STATUS(impl->SetState(newState)); - } - - StatusCode GetState(Blob::CPtr& state, ResponseDesc* resp) const noexcept override { - TO_STATUS(state = impl->GetState()); - } -}; - -IE_SUPPRESS_DEPRECATED_END - -} // namespace InferenceEngine diff --git a/inference-engine/src/inference_engine/cpp_interfaces/interface/ie_iexecutable_network_internal.cpp b/inference-engine/src/inference_engine/cpp_interfaces/interface/ie_iexecutable_network_internal.cpp index bf3086551c15fa..6b5bb34c97074e 100644 --- a/inference-engine/src/inference_engine/cpp_interfaces/interface/ie_iexecutable_network_internal.cpp +++ b/inference-engine/src/inference_engine/cpp_interfaces/interface/ie_iexecutable_network_internal.cpp @@ -49,19 +49,17 @@ std::shared_ptr IExecutableNetworkInternal::CreateInferRe } void IExecutableNetworkInternal::Export(const std::string& modelFileName) { - // we need to write to stringstream first - // because in case of exception in ExportImpl the file is not created - std::stringstream strm; - ExportImpl(strm); - std::ofstream(modelFileName.c_str()) << strm.rdbuf(); + std::ofstream modelFile(modelFileName, std::ios::out | std::ios::binary); + + if (modelFile.is_open()) { + Export(modelFile); + } else { + IE_THROW() << "The " << modelFileName << " file can not be opened for Export"; + } } void IExecutableNetworkInternal::Export(std::ostream& networkModel) { - std::stringstream strm; - strm.write(exportMagic.data(), exportMagic.size()); - strm << _plugin->GetName() << std::endl; - ExportImpl(strm); - networkModel << strm.rdbuf(); + IE_THROW(NotImplemented); } CNNNetwork 
IExecutableNetworkInternal::GetExecGraphInfo() { @@ -97,7 +95,4 @@ std::shared_ptr IExecutableNetworkInternal::CreateInferRe IE_THROW(NotImplemented); } -void IExecutableNetworkInternal::ExportImpl(std::ostream&) { - IE_THROW(NotImplemented); -} } // namespace InferenceEngine diff --git a/inference-engine/src/inference_engine/cpp_interfaces/interface/ie_iplugin_internal.cpp b/inference-engine/src/inference_engine/cpp_interfaces/interface/ie_iplugin_internal.cpp index 5637701754e3f5..88599aa78b37d2 100644 --- a/inference-engine/src/inference_engine/cpp_interfaces/interface/ie_iplugin_internal.cpp +++ b/inference-engine/src/inference_engine/cpp_interfaces/interface/ie_iplugin_internal.cpp @@ -16,24 +16,12 @@ #include #include +#include #include #include #include namespace InferenceEngine { -namespace { -void parsePluginName(std::istream& networkModel) { - ExportMagic magic = {}; - auto currentPos = networkModel.tellg(); - networkModel.read(magic.data(), magic.size()); - auto exportedWithName = (exportMagic == magic); - if (exportedWithName) { - networkModel.ignore(std::numeric_limits::max(), '\n'); - } else { - networkModel.seekg(currentPos, networkModel.beg); - } -} -} // namespace PreProcessInfo copyPreProcess(const PreProcessInfo& from) { PreProcessInfo to = from; @@ -170,22 +158,26 @@ RemoteContext::Ptr IInferencePlugin::GetDefaultContext(const ParamMap&) { IE_THROW(NotImplemented); } -std::shared_ptr IInferencePlugin::ImportNetwork(const std::string&, - const std::map&) { - IE_THROW(NotImplemented); +std::shared_ptr IInferencePlugin::ImportNetwork(const std::string& modelFileName, + const std::map& config) { + std::ifstream blobFile(modelFileName, std::ios::binary); + + if (!blobFile.is_open()) { + IE_THROW(NetworkNotRead); + } + + return ImportNetwork(blobFile, config); } std::shared_ptr IInferencePlugin::ImportNetwork(std::istream& networkModel, const std::map& config) { - parsePluginName(networkModel); - return ImportNetworkImpl(networkModel, config); + IE_THROW(NotImplemented); } std::shared_ptr IInferencePlugin::ImportNetwork(std::istream& networkModel, const std::shared_ptr& context, const std::map& config) { - parsePluginName(networkModel); - return ImportNetworkImpl(networkModel, context, config); + IE_THROW(NotImplemented); } void IInferencePlugin::SetCore(ICore* core) { @@ -213,17 +205,6 @@ std::shared_ptr IInferencePlugin::LoadExeNetworkImpl IE_THROW(NotImplemented); } -std::shared_ptr IInferencePlugin::ImportNetworkImpl(std::istream&, - const std::map&) { - IE_THROW(NotImplemented); -} - -std::shared_ptr IInferencePlugin::ImportNetworkImpl(std::istream&, - const std::shared_ptr&, - const std::map&) { - IE_THROW(NotImplemented); -} - void IInferencePlugin::SetExeNetworkInfo(const std::shared_ptr& exeNetwork, const ConstInputsDataMap& inputs, const ConstOutputsDataMap& outputs) { diff --git a/inference-engine/src/inference_engine/cpp_interfaces/interface/ie_ivariable_state_internal.cpp b/inference-engine/src/inference_engine/cpp_interfaces/interface/ie_ivariable_state_internal.cpp index 0171292d36bbdc..a499e816ee0b2c 100644 --- a/inference-engine/src/inference_engine/cpp_interfaces/interface/ie_ivariable_state_internal.cpp +++ b/inference-engine/src/inference_engine/cpp_interfaces/interface/ie_ivariable_state_internal.cpp @@ -23,7 +23,4 @@ Blob::CPtr IVariableStateInternal::GetState() const { return state; } -Blob::CPtr IVariableStateInternal::GetLastState() const { - return GetState(); -} } // namespace InferenceEngine diff --git 
a/inference-engine/src/inference_engine/ie_core.cpp b/inference-engine/src/inference_engine/ie_core.cpp index 28563a29b62d30..63814215037f16 100644 --- a/inference-engine/src/inference_engine/ie_core.cpp +++ b/inference-engine/src/inference_engine/ie_core.cpp @@ -395,6 +395,7 @@ class Core::Impl : public ICore { opsetNames.insert("opset4"); opsetNames.insert("opset5"); opsetNames.insert("opset6"); + opsetNames.insert("opset7"); } ~Impl() override = default; @@ -566,18 +567,6 @@ class Core::Impl : public ICore { SoExecutableNetworkInternal ImportNetwork(std::istream& networkModel, const std::string& deviceName, const std::map& config) override { auto parsed = parseDeviceNameIntoConfig(deviceName, config); - - if (parsed._deviceName.empty()) { - ExportMagic magic = {}; - auto currentPos = networkModel.tellg(); - networkModel.read(magic.data(), magic.size()); - auto exportedWithName = (exportMagic == magic); - if (exportedWithName) { - std::getline(networkModel, parsed._deviceName); - } - networkModel.seekg(currentPos, networkModel.beg); - } - return GetCPPPluginByName(parsed._deviceName).ImportNetwork(networkModel, parsed._config); } @@ -1022,18 +1011,6 @@ void Core::AddExtension(const IExtensionPtr& extension) { ExecutableNetwork Core::ImportNetwork(const std::string& modelFileName, const std::string& deviceName, const std::map& config) { OV_ITT_SCOPED_TASK(itt::domains::IE, "Core::ImportNetwork"); - - // TODO: remove once NotImplemented exception is deprecated and not used - if (deviceName.find("HETERO") == 0) { - IE_THROW() << "HETERO device does not support ImportNetwork"; - } - if (deviceName.find("MULTI") == 0) { - IE_THROW() << "MULTI device does not support ImportNetwork"; - } - if (deviceName.find("AUTO") == 0) { - IE_THROW() << "AUTO device does not support ImportNetwork"; - } - auto parsed = parseDeviceNameIntoConfig(deviceName, config); auto exec = _impl->GetCPPPluginByName(parsed._deviceName).ImportNetwork(modelFileName, parsed._config); return { exec, exec }; @@ -1041,10 +1018,33 @@ ExecutableNetwork Core::ImportNetwork(const std::string& modelFileName, const st ExecutableNetwork Core::ImportNetwork(std::istream& networkModel, const std::string& deviceName, const std::map& config) { + OV_ITT_SCOPED_TASK(itt::domains::IE, "Core::ImportNetwork"); auto exec = _impl->ImportNetwork(networkModel, deviceName, config); return { exec, exec }; } +ExecutableNetwork Core::ImportNetwork(std::istream& networkModel) { + OV_ITT_SCOPED_TASK(itt::domains::IE, "Core::ImportNetwork"); + + using ExportMagic = std::array; + constexpr static const ExportMagic exportMagic = {{0x1, 0xE, 0xE, 0x1}}; + + std::string deviceName; + ExportMagic magic = {}; + auto currentPos = networkModel.tellg(); + networkModel.read(magic.data(), magic.size()); + if (exportMagic == magic) { + std::getline(networkModel, deviceName); + } else { + IE_THROW() << "Passed compiled stream does not contain device name. " + "Please, provide device name manually"; + } + networkModel.seekg(currentPos, networkModel.beg); + + auto exec = _impl->GetCPPPluginByName(deviceName).ImportNetwork(networkModel, {}); + return { exec, exec }; +} + ExecutableNetwork Core::ImportNetwork(std::istream& networkModel, const RemoteContext::Ptr& context, const std::map& config) { @@ -1124,8 +1124,8 @@ Parameter Core::GetConfig(const std::string& deviceName, const std::string& name IE_THROW() << "You can only GetConfig of the AUTO itself (without devices). 
" "GetConfig is also possible for the individual devices before creating the AUTO on top."; - } - } + } + } auto parsed = parseDeviceNameIntoConfig(deviceName); diff --git a/inference-engine/src/inference_engine/ie_parameter.cpp b/inference-engine/src/inference_engine/ie_parameter.cpp deleted file mode 100644 index 61fbf54c37dcc2..00000000000000 --- a/inference-engine/src/inference_engine/ie_parameter.cpp +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include -#include - -#include - -namespace ngraph { - -template class INFERENCE_ENGINE_API_CLASS(VariantImpl); - -template <> -class INFERENCE_ENGINE_API_CLASS(VariantWrapper) : public VariantImpl { -public: - static constexpr VariantTypeInfo type_info {"Variant::InferenceEngine::Parameter", 0}; - const VariantTypeInfo& get_type_info() const override { - return type_info; - } - VariantWrapper(const value_type& value): VariantImpl(value) {} // NOLINT -}; - -} // namespace ngraph - -constexpr ngraph::VariantTypeInfo ngraph::VariantWrapper::type_info; - -InferenceEngine::Parameter::Parameter(const std::shared_ptr& var) { - if (auto paramWrapper = std::dynamic_pointer_cast>(var)) { - auto param = paramWrapper->get(); - if (!param.empty()) ptr = param.ptr->copy(); - } -} - -InferenceEngine::Parameter::Parameter(std::shared_ptr& var) { - if (auto paramWrapper = std::dynamic_pointer_cast>(var)) { - auto param = paramWrapper->get(); - if (!param.empty()) ptr = param.ptr->copy(); - } -} - - -std::shared_ptr InferenceEngine::Parameter::asVariant() const { - return std::make_shared>(*this); -} diff --git a/inference-engine/src/inference_engine/ie_transformations.cpp b/inference-engine/src/inference_engine/ie_transformations.cpp index 15360ae97ea09a..2a87671ce25d4f 100644 --- a/inference-engine/src/inference_engine/ie_transformations.cpp +++ b/inference-engine/src/inference_engine/ie_transformations.cpp @@ -11,6 +11,16 @@ using namespace InferenceEngine; void InferenceEngine::LowLatency(InferenceEngine::CNNNetwork &network) { auto function = network.getFunction(); ngraph::pass::Manager manager; + NGRAPH_SUPPRESS_DEPRECATED_START manager.register_pass(); + NGRAPH_SUPPRESS_DEPRECATED_END + manager.run_passes(function); +} + +void InferenceEngine::lowLatency2(InferenceEngine::CNNNetwork &network, + bool use_const_initializer) { + auto function = network.getFunction(); + ngraph::pass::Manager manager; + manager.register_pass(use_const_initializer); manager.run_passes(function); } diff --git a/inference-engine/src/legacy_api/CMakeLists.txt b/inference-engine/src/legacy_api/CMakeLists.txt index 8eae82bd288b02..4cae8b7c6b1d7e 100644 --- a/inference-engine/src/legacy_api/CMakeLists.txt +++ b/inference-engine/src/legacy_api/CMakeLists.txt @@ -75,7 +75,7 @@ set_target_properties(${TARGET_NAME} ${TARGET_NAME}_obj # developer package -ie_developer_export_targets(${TARGET_NAME}) +openvino_developer_export_targets(COMPONENT inference_engine TARGETS ${TARGET_NAME}) # install diff --git a/inference-engine/src/legacy_api/src/convert_function_to_cnn_network.cpp b/inference-engine/src/legacy_api/src/convert_function_to_cnn_network.cpp index 08b54640ee6e26..7d92c77219d834 100644 --- a/inference-engine/src/legacy_api/src/convert_function_to_cnn_network.cpp +++ b/inference-engine/src/legacy_api/src/convert_function_to_cnn_network.cpp @@ -244,6 +244,9 @@ CNNLayer::Ptr createSubGraphLayer(const std::shared_ptr& layer) { LayerParams params = {layer->get_friendly_name(), "TensorIterator", 
details::convertPrecision(layer->get_output_element_type(0))}; auto res = std::make_shared(params); + if (res == nullptr) { + IE_THROW() << "Can't create TensorIterator"; + } res->body = body; // Port map: outputs diff --git a/inference-engine/src/legacy_api/src/transformations/convert_opset1_to_legacy/convert_strided_slice_to_crop.cpp b/inference-engine/src/legacy_api/src/transformations/convert_opset1_to_legacy/convert_strided_slice_to_crop.cpp index a3ec122b9c7ab3..cab07f54a762e6 100644 --- a/inference-engine/src/legacy_api/src/transformations/convert_opset1_to_legacy/convert_strided_slice_to_crop.cpp +++ b/inference-engine/src/legacy_api/src/transformations/convert_opset1_to_legacy/convert_strided_slice_to_crop.cpp @@ -9,7 +9,6 @@ #include #include - #include #include @@ -137,7 +136,6 @@ ngraph::pass::ConvertStridedSliceToCropMatcher::ConvertStridedSliceToCropMatcher lb = std::min(static_cast(input_shape[input_shape_idx]), lb); ub = std::min(static_cast(input_shape[input_shape_idx]), ub); - offset.emplace_back(lb); // set default value for stride or use given value int64_t stride = 1; @@ -153,6 +151,7 @@ ngraph::pass::ConvertStridedSliceToCropMatcher::ConvertStridedSliceToCropMatcher ub = -1; lb = std::min(lb, static_cast(input_shape[input_shape_idx]) - 1); + offset.emplace_back(lb); lb -= 1; // we always get 1st element, so we need decrease range if (ub <= lb) dimension = (ub - lb) / stride + 1; @@ -160,12 +159,16 @@ ngraph::pass::ConvertStridedSliceToCropMatcher::ConvertStridedSliceToCropMatcher // apply masks if (begin_mask.count(axis)) lb = 0; - if (end_mask.count(axis)) + offset.emplace_back(lb); + + if (end_mask.count(axis)) { ub = static_cast(input_shape[input_shape_idx]); + } lb += 1; // we always get 1st element, so we need decrease range - if (ub >= lb) + if (ub >= lb) { dimension = (ub - lb) / stride + 1; + } } dim.emplace_back(dimension); diff --git a/inference-engine/src/low_precision_transformations/CMakeLists.txt b/inference-engine/src/low_precision_transformations/CMakeLists.txt index 5d0dfc04fcfe3b..c6306dbc08f067 100644 --- a/inference-engine/src/low_precision_transformations/CMakeLists.txt +++ b/inference-engine/src/low_precision_transformations/CMakeLists.txt @@ -45,7 +45,7 @@ set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_REL # developer package -ie_developer_export_targets(${TARGET_NAME}) +openvino_developer_export_targets(COMPONENT inference_engine TARGETS ${TARGET_NAME}) # install diff --git a/inference-engine/src/low_precision_transformations/include/low_precision/network_helper.hpp b/inference-engine/src/low_precision_transformations/include/low_precision/network_helper.hpp index 8cf52a13fe20ca..bf45cdeae94133 100644 --- a/inference-engine/src/low_precision_transformations/include/low_precision/network_helper.hpp +++ b/inference-engine/src/low_precision_transformations/include/low_precision/network_helper.hpp @@ -270,6 +270,12 @@ template std::shared_ptr fold_reshape(Args&&... 
args) { std::shared_ptr node = std::make_shared(std::forward(args)...); if (node->get_output_size() == 1) { + // issue #57985: remove fold_reshape & reuse nGraph implementation + const auto values = as_type_ptr(node->input_value(1).get_node_shared_ptr())->template cast_vector(); + if (std::any_of(values.begin(), values.end(), [](const int64_t value) { return (value == 0) || (value == -1); })) { + return fold(std::forward(args)...); + } + OutputVector folded; if (is_type(node->input_value(0).get_node_shared_ptr()) && is_type(node->input_value(1).get_node_shared_ptr())) { diff --git a/inference-engine/src/low_precision_transformations/src/concat.cpp b/inference-engine/src/low_precision_transformations/src/concat.cpp index 4988e29b1e289a..f6d860ed17225c 100644 --- a/inference-engine/src/low_precision_transformations/src/concat.cpp +++ b/inference-engine/src/low_precision_transformations/src/concat.cpp @@ -43,19 +43,21 @@ bool ConcatTransformation::transform(TransformationContext& context, ngraph::pat return false; } - // precisions can be different + // Concat operations precision is defined: + // 1. consumers after Concat + // 2. FakeQuantize precisions without zero point ngraph::Node& quantizationLayer = *subgraph.quantizationLayers[0]; std::shared_ptr fq = ngraph::as_type_ptr(quantizationLayer.shared_from_this()); if (!NetworkHelper::isQuantizeSupported(fq)) { return false; } - - std::vector concatParentsChildrensPrecisions = precisionsOnActivations; - fillAvailablePrecisions(subgraph.quantizationLayers[0], concatParentsChildrensPrecisions); - if (concatParentsChildrensPrecisions.empty()) { + DataPrecision dataPrecision = getDataPrecision(fq, QuantizationDetails::getDetails(fq), false); + if (dataPrecision.precision == ngraph::element::undefined) { return false; } + std::vector concatChildrenPrecisions = precisionsOnActivations; + for (size_t i = 0; i < subgraph.quantizationLayers.size(); ++i) { fq = ngraph::as_type_ptr(subgraph.quantizationLayers[i]); if (fq == nullptr) { @@ -72,20 +74,28 @@ bool ConcatTransformation::transform(TransformationContext& context, ngraph::pat if (quantizationDetails.inputHighValues.size() != 1ul) { return false; } - std::vector fqChildrensPrecisions = precisionsOnActivations; - fillAvailablePrecisions(subgraph.quantizationLayers[i], fqChildrensPrecisions); - concatParentsChildrensPrecisions = NetworkHelper::precisionIntersection(concatParentsChildrensPrecisions, fqChildrensPrecisions); - if (concatParentsChildrensPrecisions.empty()) { + // define concatenation operation consumers precisions + std::vector fqChildrenPrecisions = precisionsOnActivations; + fillAvailablePrecisions(subgraph.quantizationLayers[i], fqChildrenPrecisions); + concatChildrenPrecisions = NetworkHelper::precisionIntersection(concatChildrenPrecisions, fqChildrenPrecisions); + if (concatChildrenPrecisions.empty()) { + return false; + } + + // define FakeQuantize precisions without zero point + const DataPrecision dataPrecision2 = getDataPrecision(subgraph.quantizationLayers[i]->shared_from_this(), quantizationDetails, false); + if (dataPrecision2.precision == ngraph::element::undefined) { return false; } + + if (dataPrecision.precision != dataPrecision2.precision) { + dataPrecision = dataPrecision.precision.is_signed() ? 
dataPrecision : dataPrecision2; + } } - DataPrecision dataPrecision; - if (std::find(concatParentsChildrensPrecisions.begin(), concatParentsChildrensPrecisions.end(), element::i8) != concatParentsChildrensPrecisions.end()) { - dataPrecision = DataPrecision(element::i8); - } else { - dataPrecision = DataPrecision(concatParentsChildrensPrecisions[0]); + if (std::find(concatChildrenPrecisions.begin(), concatChildrenPrecisions.end(), dataPrecision.precision) == concatChildrenPrecisions.end()) { + dataPrecision = DataPrecision(concatChildrenPrecisions[0]); } std::vector quantizationLayersDetails; diff --git a/inference-engine/src/low_precision_transformations/src/concat_multi_channels.cpp b/inference-engine/src/low_precision_transformations/src/concat_multi_channels.cpp index dc81d51cd717de..e36c2b5aa74528 100644 --- a/inference-engine/src/low_precision_transformations/src/concat_multi_channels.cpp +++ b/inference-engine/src/low_precision_transformations/src/concat_multi_channels.cpp @@ -64,14 +64,23 @@ bool ConcatMultiChannelsTransformation::transform(TransformationContext& context DataPrecision dataPrecision; { + std::vector concatChildrenPrecisions = precisionsOnActivations; for (auto quantizationLayer : subgraph.quantizationLayers) { std::shared_ptr fq = ngraph::as_type_ptr(quantizationLayer->shared_from_this()); if (!NetworkHelper::isQuantizeSupported(fq)) { return false; } - const DataPrecision tmp = getDataPrecision(fq, QuantizationDetails::getDetails(fq), false); + // define concatenation operation consumers precisions + std::vector fqChildrenPrecisions = precisionsOnActivations; + fillAvailablePrecisions(quantizationLayer, fqChildrenPrecisions); + concatChildrenPrecisions = NetworkHelper::precisionIntersection(concatChildrenPrecisions, fqChildrenPrecisions); + if (concatChildrenPrecisions.empty()) { + return false; + } + // define FakeQuantize precisions without zero point + const DataPrecision tmp = getDataPrecision(fq, QuantizationDetails::getDetails(fq), false); if (dataPrecision.precision == ngraph::element::undefined) { dataPrecision = tmp; continue; @@ -81,6 +90,10 @@ bool ConcatMultiChannelsTransformation::transform(TransformationContext& context dataPrecision = tmp; } } + + if (std::find(concatChildrenPrecisions.begin(), concatChildrenPrecisions.end(), dataPrecision.precision) == concatChildrenPrecisions.end()) { + dataPrecision = DataPrecision(concatChildrenPrecisions[0]); + } } for (size_t i = 0; i < subgraph.quantizationLayers.size(); ++i) { diff --git a/inference-engine/src/low_precision_transformations/src/network_helper.cpp b/inference-engine/src/low_precision_transformations/src/network_helper.cpp index 4a1e942e5753ba..346034e02abd89 100644 --- a/inference-engine/src/low_precision_transformations/src/network_helper.cpp +++ b/inference-engine/src/low_precision_transformations/src/network_helper.cpp @@ -549,7 +549,11 @@ std::shared_ptr NetworkHelper::separateInStandaloneBranch(std::sha } std::vector> inputs = node->input_values(); - const size_t inputIndex = NetworkHelper::getChildInputIndex(dequantization.multiply, node); + const auto originalParent = dequantization.multiply ? 
+ dequantization.multiply->shared_from_this() : + dequantization.subtract->shared_from_this(); + + const size_t inputIndex = NetworkHelper::getChildInputIndex(originalParent, node); inputs[inputIndex] = parent; const std::shared_ptr newNode = node->clone_with_new_inputs(inputs); @@ -683,7 +687,7 @@ std::shared_ptr NetworkHelper::foldFakeQuantize( auto levels_1 = fq->get_levels() - 1.f; const size_t DHW = D * H * W; - const size_t IDHW = IC * D * H * W; + const size_t IDHW = outChannelsShapeIndex == 0 ? IC * D * H * W : OC * D * H * W; const auto values = constant->cast_vector(); std::vector quantizedValues(OC * IC * D * H * W); diff --git a/inference-engine/src/low_precision_transformations/src/normalize_l2.cpp b/inference-engine/src/low_precision_transformations/src/normalize_l2.cpp index 4368a48075f324..93dcdf8afbe551 100644 --- a/inference-engine/src/low_precision_transformations/src/normalize_l2.cpp +++ b/inference-engine/src/low_precision_transformations/src/normalize_l2.cpp @@ -120,10 +120,10 @@ bool NormalizeL2Transformation::transform(TransformationContext &context, ngraph } auto newNormalize = std::make_shared>( - std::vector{ element::f32, element::f32 }, + std::vector{ element::f32, axes->output(0).get_element_type() }, std::vector{deqPrecision}, ngraph::op::TemporaryReplaceOutputType(dequantization.subtract == nullptr ? dequantization.data : dequantization.subtract, element::f32).get(), - ngraph::op::TemporaryReplaceOutputType(axes->clone_with_new_inputs({}), element::f32).get(), + axes, normalize->get_eps(), normalize->get_eps_mode()); NetworkHelper::copyInfo(normalize, newNormalize); diff --git a/inference-engine/src/low_precision_transformations/src/split.cpp b/inference-engine/src/low_precision_transformations/src/split.cpp index 486111dd73778a..919c6b5e87b185 100644 --- a/inference-engine/src/low_precision_transformations/src/split.cpp +++ b/inference-engine/src/low_precision_transformations/src/split.cpp @@ -111,13 +111,13 @@ void SplitTransformation::updateOutputs( updateOutput(context, lastNodes[0], originalNode); } else { const std::string originalName = originalNode->get_friendly_name(); - for (auto& lastNode : lastNodes) { + for (size_t outIdx = 0; outIdx < lastNodes.size(); ++outIdx) { for (size_t i = 0; i < outputSize; ++i) { std::shared_ptr result = context.function->get_output_op(i); std::shared_ptr outputNode = result->get_input_node_shared_ptr(0); - if (outputNode.get() == lastNode.get()) { + if (outputNode.get() == lastNodes[outIdx].get()) { originalNode->set_friendly_name(originalName + LayerTransformation::originalLayerPostfix); - lastNode->set_friendly_name(originalName + "." + std::to_string(i)); + lastNodes[outIdx]->set_friendly_name(originalName + "." + std::to_string(outIdx)); break; } } diff --git a/inference-engine/src/low_precision_transformations/src/strided_slice.cpp b/inference-engine/src/low_precision_transformations/src/strided_slice.cpp index a269e392302ce4..e3d420c849068f 100644 --- a/inference-engine/src/low_precision_transformations/src/strided_slice.cpp +++ b/inference-engine/src/low_precision_transformations/src/strided_slice.cpp @@ -17,13 +17,12 @@ std::shared_ptr stridedSliceDeqConstant( const std::shared_ptr strSlice, const std::shared_ptr dequantizaitonConstant) { auto constant = as_type_ptr(dequantizaitonConstant); - // issue #48857: constant is mistakenly recognized as a scalar. 
Uncomment after fix - //if (NetworkHelper::isScalarLike(constant)) { - // return NetworkHelper::toScalar(constant); - //} + auto constantShape = constant->get_shape(); + if (ngraph::shape_size(constantShape) == 1ul) { + return NetworkHelper::toScalar(constant); + } const auto stridedSliceShape = strSlice->get_input_shape(0); - auto constantShape = constant->get_shape(); if (stridedSliceShape.size() != constantShape.size()) { ngraph::Shape newConstantShape; if (ngraph::shape_size(constantShape) == 1) { diff --git a/inference-engine/src/low_precision_transformations/src/weightable_layer_transformation.cpp b/inference-engine/src/low_precision_transformations/src/weightable_layer_transformation.cpp index ce0ae3473d92e5..babcc95303cffe 100644 --- a/inference-engine/src/low_precision_transformations/src/weightable_layer_transformation.cpp +++ b/inference-engine/src/low_precision_transformations/src/weightable_layer_transformation.cpp @@ -157,10 +157,15 @@ bool WeightableLayerTransformation::canBeTransformed(const TransformationContext } const size_t outChannelsShapeIndex = is_type(layer) ? 1ul : 0ul; - if ( // Check if all dimensions of scale except the output channels are all ones + if ( + // expected, it's ok: return true + (shape_size(constOutputShape) != 1ul) && + // not expected, something wrong: return false + ((constOutputShape.size() <= outChannelsShapeIndex) || + // Check if all dimensions of scale except the output channels are all ones (shape_size(constOutputShape) != constOutputShape[outChannelsShapeIndex]) || ((constOutputShape[outChannelsShapeIndex] != 1ul) && - (fqFromWeights->get_output_shape(0)[outChannelsShapeIndex] != constOutputShape[outChannelsShapeIndex]))) { + (fqFromWeights->get_output_shape(0)[outChannelsShapeIndex] != constOutputShape[outChannelsShapeIndex])))) { return false; } } else { diff --git a/inference-engine/src/mkldnn_plugin/config.h b/inference-engine/src/mkldnn_plugin/config.h index 01eb0e23c5ee1a..54336d58495276 100644 --- a/inference-engine/src/mkldnn_plugin/config.h +++ b/inference-engine/src/mkldnn_plugin/config.h @@ -4,9 +4,11 @@ #pragma once +#include +#include "utils/debug_capabilities.h" + #include #include -#include namespace MKLDNNPlugin { @@ -35,6 +37,10 @@ struct Config { bool manualEnforceBF16 = false; #endif +#ifdef CPU_DEBUG_CAPS + DebugCaps::Config debugCaps; +#endif + void readProperties(const std::map &config); void updateProperties(); std::map _config; diff --git a/inference-engine/src/mkldnn_plugin/cpu_types.h b/inference-engine/src/mkldnn_plugin/cpu_types.h index d7f55446024c8c..e5bc8af0b5c745 100644 --- a/inference-engine/src/mkldnn_plugin/cpu_types.h +++ b/inference-engine/src/mkldnn_plugin/cpu_types.h @@ -64,7 +64,28 @@ enum Type { Reference, ShuffleChannels, DFT, - Math + Math, + CTCLoss, + Bucketize, + CTCGreedyDecoder, + CTCGreedyDecoderSeqLen, + CumSum, + DetectionOutput, + ExperimentalDetectronDetectionOutput, + LogSoftmax, + TopK, + GatherTree, + GRN, + Range, + Proposal, + ReorgYolo, + ReverseSequence, + ExperimentalDetectronTopKROIs, + ExperimentalDetectronROIFeatureExtractor, + ExperimentalDetectronPriorGridGenerator, + ExperimentalDetectronGenerateProposalsSingleImage, + ExtractImagePatches, + NonMaxSuppression }; enum Algorithm { diff --git a/inference-engine/src/mkldnn_plugin/emitters/jit_load_store_emitters.cpp b/inference-engine/src/mkldnn_plugin/emitters/jit_load_store_emitters.cpp index 57689d6302d70a..4d1e3819394fc8 100644 --- a/inference-engine/src/mkldnn_plugin/emitters/jit_load_store_emitters.cpp +++ 
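The jit_load_store_emitters hunks below drop the explicit uni_vroundps call (immediate 3, i.e. the "rounding to zero" noted in the removed comment) that preceded uni_vcvtps2dq, so the FP32 to I32 conversion presumably now follows the conversion instruction's default round-to-nearest behaviour instead of truncating. A plain C++ illustration of that behavioural difference, independent of the JIT emitters:

#include <cmath>
#include <iostream>

int main() {
    const float x = 2.7f;
    std::cout << static_cast<int>(x) << '\n';  // 2: truncation toward zero
    std::cout << std::lrint(x) << '\n';        // 3: uses the current rounding mode (nearest by default)
    return 0;
}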
b/inference-engine/src/mkldnn_plugin/emitters/jit_load_store_emitters.cpp @@ -106,7 +106,6 @@ void jit_load_emitter::emit_isa(const Xbyak::Reg64 ®_src, int offset_byte, In break; case Precision::I32: if ((src_prc == Precision::FP32) || (src_prc == Precision::BF16)) { - h->uni_vroundps(Vmm(out_vec_idx), Vmm(out_vec_idx), 3); // rounding to zero h->uni_vcvtps2dq(Vmm(out_vec_idx), Vmm(out_vec_idx)); } break; @@ -511,6 +510,11 @@ size_t jit_store_emitter::aux_vecs_count() const { size_t jit_store_emitter::get_inputs_num() const { return 1; } +void jit_store_emitter::emit_data() const { + if (emu_vcvtneps2bf16) + emu_vcvtneps2bf16->emit_data(); +} + void jit_store_emitter::emit_impl(const std::vector &in_idxs, const std::vector &out_idxs, const std::vector &pool_vec_idxs, const std::vector &pool_gpr_idxs, const emitter_context *emit_context) const { @@ -552,7 +556,6 @@ template switch (src_prc) { case Precision::FP32: if ((dst_prc != Precision::FP32) && (dst_prc != Precision::BF16)) { - h->uni_vroundps(Vmm(in_vec_idx), Vmm(in_vec_idx), 3); // rounding to zero h->uni_vcvtps2dq(Vmm(in_vec_idx), Vmm(in_vec_idx)); } break; diff --git a/inference-engine/src/mkldnn_plugin/emitters/jit_load_store_emitters.hpp b/inference-engine/src/mkldnn_plugin/emitters/jit_load_store_emitters.hpp index 00c2e49262d9a5..ec863d0c69e596 100644 --- a/inference-engine/src/mkldnn_plugin/emitters/jit_load_store_emitters.hpp +++ b/inference-engine/src/mkldnn_plugin/emitters/jit_load_store_emitters.hpp @@ -18,8 +18,8 @@ struct load_emitter_context : public emitter_context { load_emitter_context() : src_prc_(Precision::FP32), dst_prc_(Precision::FP32), load_num_(8), offset_byte_(0), is_fill_(false), fill_value_("zero") {} - load_emitter_context(Precision src_prc, Precision dst_prc, int load_num, bool is_fill = false, std::string fill_value = "zero", int offset_byte = 0): - src_prc_(src_prc), dst_prc_(dst_prc), load_num_(load_num), is_fill_(is_fill), fill_value_(fill_value), offset_byte_(offset_byte) {} + load_emitter_context(Precision src_prc, Precision dst_prc, int load_num, int offset_byte = 0, bool is_fill = false, std::string fill_value = "zero"): + src_prc_(src_prc), dst_prc_(dst_prc), load_num_(load_num), offset_byte_(offset_byte), is_fill_(is_fill), fill_value_(fill_value) {} int offset_byte_; int load_num_; @@ -124,6 +124,8 @@ class jit_store_emitter : public jit_emitter { size_t get_inputs_num() const override; + void emit_data() const override; + std::shared_ptr get_emu_vcvtneps2bf16() const { return emu_vcvtneps2bf16; } diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_edge.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_edge.cpp index b5ff60efed0af9..1415dc1ae95e20 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_edge.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_edge.cpp @@ -32,7 +32,7 @@ bool MKLDNNEdge::isUseExternalMemory() const { return externalMemoryPtr; } -bool MKLDNNEdge::isDropped() { +bool MKLDNNEdge::isDropped() const { bool not_in_parent = true; bool not_in_child = true; @@ -124,6 +124,10 @@ void MKLDNNEdge::reuse(MKLDNNMemoryPtr ptr) { status = Status::Allocated; } +const InferenceEngine::TensorDesc& MKLDNNEdge::getInputDescRO() const { + return inputDesc; +} + InferenceEngine::TensorDesc MKLDNNEdge::getInputDesc() { if (inputDesc.getLayout() == InferenceEngine::Layout::ANY) { inputDesc = getSpecifiedInputDesc({}); @@ -131,6 +135,10 @@ InferenceEngine::TensorDesc MKLDNNEdge::getInputDesc() { return inputDesc; } +const InferenceEngine::TensorDesc& MKLDNNEdge::getOutputDescRO() const { 
+ return outputDesc; +} + InferenceEngine::TensorDesc MKLDNNEdge::getOutputDesc() { if (outputDesc.getLayout() == InferenceEngine::Layout::ANY) { outputDesc = getSpecifiedOutputDesc({}); @@ -145,11 +153,11 @@ InferenceEngine::TensorDesc MKLDNNEdge::getDesc() { return getInputDesc(); } -int MKLDNNEdge::getInputNum() { +int MKLDNNEdge::getInputNum() const { return parent_port; } -int MKLDNNEdge::getOutputNum() { +int MKLDNNEdge::getOutputNum() const { return child_port; } diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_edge.h b/inference-engine/src/mkldnn_plugin/mkldnn_edge.h index c9884caf56e823..63e2a16414d94f 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_edge.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_edge.h @@ -61,11 +61,11 @@ class MKLDNNEdge { MKLDNNMemoryPtr& getMemoryPtr(); bool needReorder(); - bool isDropped(); + bool isDropped() const; bool isUseExternalMemory() const; - int getInputNum(); - int getOutputNum(); + int getInputNum() const; + int getOutputNum() const; void setChildPort(const size_t port) { child_port = port; } @@ -73,10 +73,12 @@ class MKLDNNEdge { MKLDNNEdgePtr getSharedEdge() const; MKLDNNEdgePtr getSharedEdge(std::nothrow_t) const; + const InferenceEngine::TensorDesc& getInputDescRO() const; + const InferenceEngine::TensorDesc& getOutputDescRO() const; + private: std::string name(); -private: std::weak_ptr parent; std::weak_ptr child; int parent_port; diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp index b92afb8a9f0d4f..efc99bddb84ae3 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp @@ -78,7 +78,10 @@ void MKLDNNGraph::CreateGraph(NET &net, const MKLDNNExtensionManager::Ptr& extMg Replicate(net, extMgr); InitGraph(); + status = Ready; + + ENABLE_CPU_DEBUG_CAP(serialize(*this)); } template void MKLDNNGraph::CreateGraph(const std::shared_ptr&, @@ -344,10 +347,6 @@ void MKLDNNGraph::InitGraph() { graphNode->cleanup(); } #endif - -#if !defined(NDEBUG) && defined(PRINT_GRAPH_INFO) - printGraphInfo(); -#endif ExecuteConstantNodesOnly(); } @@ -809,7 +808,7 @@ void MKLDNNGraph::Infer(MKLDNNInferRequest* request, int batch) { mkldnn::stream stream(eng); - ENABLE_CPU_DEBUG_CAP(NodeDumper nd(infer_count)); + ENABLE_CPU_DEBUG_CAP(NodeDumper nd(config.debugCaps, infer_count)); for (int i = 0; i < graphNodes.size(); i++) { if (request != nullptr) { @@ -954,6 +953,10 @@ void MKLDNNGraph::setConfig(const Config &cfg) { config = cfg; } +const Config& MKLDNNGraph::getConfig() const { + return config; +} + void MKLDNNGraph::setProperty(const std::map& properties) { config.readProperties(properties); } @@ -1217,21 +1220,3 @@ void MKLDNNGraph::EnforceBF16() { InferenceEngine::CNNNetwork MKLDNNGraph::dump() const { return dump_graph_as_ie_ngraph_net(*this); } - -void MKLDNNGraph::printGraphInfo() const { - for (auto &graphNode : graphNodes) { - std::cout << "name: " << graphNode->getName() << " [ "; - if (graphNode->parentEdges.size() > 0) { - auto prnt_out_desc = graphNode->parentEdges[0].lock()->getOutputDesc(); - std::cout << "in: " << prnt_out_desc.getPrecision().name() - << "/l=" << prnt_out_desc.getLayout() - << "; "; - } - if (graphNode->childEdges.size() > 0) { - auto chld_in_desc = graphNode->childEdges[0].lock()->getInputDesc(); - std::cout << "out: " << chld_in_desc.getPrecision().name() - << "/l=" << chld_in_desc.getLayout(); - } - std::cout << " ]" << std::endl; - } -} diff --git 
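The CreateGraph change above wraps the new serialize(*this) call in ENABLE_CPU_DEBUG_CAP, and printGraphInfo is removed in favour of the serializer added below. A sketch of how such a macro is typically defined so the call compiles away when CPU_DEBUG_CAPS is off; the real definition lives in utils/debug_capabilities.h and is assumed here rather than quoted from it:

#ifdef CPU_DEBUG_CAPS
#   define ENABLE_CPU_DEBUG_CAP(_x) _x;
#else
#   define ENABLE_CPU_DEBUG_CAP(_x)
#endif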
a/inference-engine/src/mkldnn_plugin/mkldnn_graph.h b/inference-engine/src/mkldnn_plugin/mkldnn_graph.h index c3fcb0d5c9c635..1b54f71e88c1cd 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_graph.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph.h @@ -39,6 +39,8 @@ class MKLDNNGraph { } void setConfig(const Config &cfg); + const Config& getConfig() const; + void setProperty(const std::map &properties); Config getProperty() const; @@ -59,6 +61,10 @@ class MKLDNNGraph { void Infer(MKLDNNInferRequest* request = nullptr, int batch = -1); + const std::vector& GetNodes() const { + return graphNodes; + } + std::vector& GetNodes() { return graphNodes; } @@ -219,7 +225,6 @@ class MKLDNNGraph { private: void EnforceBF16(); - void printGraphInfo() const; }; } // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.cpp index 14d2f6a28ae7e8..ac4bfff6b6d2f0 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.cpp @@ -5,9 +5,11 @@ #include "mkldnn_graph_dumper.h" #include #include "exec_graph_info.hpp" +#include "ie_common.h" #include "mkldnn_debug.h" #include #include "ngraph/ngraph.hpp" +#include "utils/debug_capabilities.h" #include #include @@ -18,6 +20,9 @@ using namespace InferenceEngine; namespace MKLDNNPlugin { +void serializeToCout(const MKLDNNGraph &graph); +void serializeToXML(const MKLDNNGraph &graph, const std::string& path); + namespace { std::map extract_node_metadata(const MKLDNNNodePtr &node) { @@ -207,4 +212,46 @@ InferenceEngine::CNNNetwork dump_graph_as_ie_ngraph_net(const MKLDNNGraph &graph return net; } +#ifdef CPU_DEBUG_CAPS +void serialize(const MKLDNNGraph &graph) { + const std::string& path = graph.getConfig().debugCaps.execGraphPath; + + if (path.empty()) + return; + + if (path == "cout") + serializeToCout(graph); + else if (!path.compare(path.size() - 4, 4, ".xml")) + serializeToXML(graph, path); + else + IE_THROW() << "Unknown serialize format. Should be either 'cout' or '*.xml'. 
Got " << path; +} + +void serializeToXML(const MKLDNNGraph &graph, const std::string& path) { + if (path.empty()) + return; + + graph.dump().serialize(path); +} + +void serializeToCout(const MKLDNNGraph &graph) { + for (const auto& node : graph.GetNodes()) { + std::cout << "name: " << node->getName() << " [ "; + if (!node->getParentEdges().empty()) { + const auto& parentEdge = *(node->getParentEdges()[0].lock()); + const auto& prnt_out_desc = parentEdge.getOutputDescRO(); + std::cout << "in: " << prnt_out_desc.getPrecision().name() + << "/l=" << prnt_out_desc.getLayout() + << "; "; + } + if (!node->getChildEdges().empty()) { + const auto& childEdge = *(node->getChildEdges()[0].lock()); + const auto& chld_in_desc = childEdge.getInputDescRO(); + std::cout << "out: " << chld_in_desc.getPrecision().name() + << "/l=" << chld_in_desc.getLayout(); + } + std::cout << " ]" << std::endl; + } +} +#endif } // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.h b/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.h index d954695baaa050..597568224f38f0 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.h @@ -6,11 +6,14 @@ #include "cpp/ie_cnn_network.h" #include "mkldnn_graph.h" +#include "utils/debug_capabilities.h" #include namespace MKLDNNPlugin { InferenceEngine::CNNNetwork dump_graph_as_ie_ngraph_net(const MKLDNNGraph &graph); - +#ifdef CPU_DEBUG_CAPS +void serialize(const MKLDNNGraph &graph); +#endif // CPU_DEBUG_CAPS } // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp index e2e2a3276b8c78..e46c7a7b0bdf9e 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp +++ b/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp @@ -203,6 +203,27 @@ static const InferenceEngine::details::caseless_unordered_map { "SoftPlus", Math}, { "Softsign", Math}, { "Tan", Math}, + { "CTCLoss", CTCLoss}, + { "Bucketize", Bucketize}, + { "CTCGreedyDecoder", CTCGreedyDecoder}, + { "CTCGreedyDecoderSeqLen", CTCGreedyDecoderSeqLen}, + { "CumSum", CumSum}, + { "DetectionOutput", DetectionOutput}, + { "ExperimentalDetectronDetectionOutput", ExperimentalDetectronDetectionOutput}, + { "LogSoftmax", LogSoftmax}, + { "TopK", TopK}, + { "GatherTree", GatherTree}, + { "GRN", GRN}, + { "Range", Range}, + { "Proposal", Proposal}, + { "ReorgYolo", ReorgYolo}, + { "ReverseSequence", ReverseSequence}, + { "ExperimentalDetectronTopKROIs", ExperimentalDetectronTopKROIs}, + { "ExperimentalDetectronROIFeatureExtractor", ExperimentalDetectronROIFeatureExtractor}, + { "ExperimentalDetectronPriorGridGenerator", ExperimentalDetectronPriorGridGenerator}, + { "ExperimentalDetectronGenerateProposalsSingleImage", ExperimentalDetectronGenerateProposalsSingleImage}, + { "ExtractImagePatches", ExtractImagePatches}, + { "NonMaxSuppressionIEInternal", NonMaxSuppression} }; Type TypeFromName(const std::string type) { diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_node.h b/inference-engine/src/mkldnn_plugin/mkldnn_node.h index e5f86f03ea0c4a..29618d51fdbaf5 100644 --- a/inference-engine/src/mkldnn_plugin/mkldnn_node.h +++ b/inference-engine/src/mkldnn_plugin/mkldnn_node.h @@ -129,7 +129,7 @@ static std::string NameFromType(Type type) { case EmbeddingBagPackedSum: return "EmbeddingBagPackedSum"; case EmbeddingBagOffsetsSum: - return "EmbeddingBagPackedSum"; + return "EmbeddingBagOffsetsSum"; case Gather: return "Gather"; case 
GatherElements: @@ -150,6 +150,48 @@ static std::string NameFromType(Type type) { return "DFT"; case Math: return "Math"; + case CTCLoss: + return "CTCLoss"; + case Bucketize: + return "Bucketize"; + case CTCGreedyDecoder: + return "CTCGreedyDecoder"; + case CTCGreedyDecoderSeqLen: + return "CTCGreedyDecoderSeqLen"; + case CumSum: + return "CumSum"; + case DetectionOutput: + return "DetectionOutput"; + case ExperimentalDetectronDetectionOutput: + return "ExperimentalDetectronDetectionOutput"; + case LogSoftmax: + return "LogSoftmax"; + case TopK: + return "TopK"; + case GatherTree: + return "GatherTree"; + case GRN: + return "GRN"; + case Range: + return "Range"; + case Proposal: + return "Proposal"; + case ReorgYolo: + return "ReorgYolo"; + case ReverseSequence: + return "ReverseSequence"; + case ExperimentalDetectronTopKROIs: + return "ExperimentalDetectronTopKROIs"; + case ExperimentalDetectronROIFeatureExtractor: + return "ExperimentalDetectronROIFeatureExtractor"; + case ExperimentalDetectronPriorGridGenerator: + return "ExperimentalDetectronPriorGridGenerator"; + case ExperimentalDetectronGenerateProposalsSingleImage: + return "ExperimentalDetectronGenerateProposalsSingleImage"; + case ExtractImagePatches: + return "ExtractImagePatches"; + case NonMaxSuppression: + return "NonMaxSuppression"; default: return "Unknown"; } diff --git a/inference-engine/src/mkldnn_plugin/ngraph_transformations/reshape_fc_fusion.cpp b/inference-engine/src/mkldnn_plugin/ngraph_transformations/reshape_fc_fusion.cpp index 09d3e7e05540bd..b850bd98ae2979 100644 --- a/inference-engine/src/mkldnn_plugin/ngraph_transformations/reshape_fc_fusion.cpp +++ b/inference-engine/src/mkldnn_plugin/ngraph_transformations/reshape_fc_fusion.cpp @@ -22,7 +22,11 @@ MKLDNNPlugin::ReshapeFullyConnectedFusion::ReshapeFullyConnectedFusion() { ngraph::matcher_pass_callback callback = [this](ngraph::pattern::Matcher &m) { auto fc = std::dynamic_pointer_cast(m.get_match_root()); + if (!fc) + return false; auto reshape = std::dynamic_pointer_cast(fc->get_input_node_shared_ptr(0)); + if (!reshape) + return false; // Check that Reshape reshapes 4D tensor to 2D or input shape = output shape auto shape_in = reshape->input_value(0).get_shape(); @@ -67,6 +71,8 @@ MKLDNNPlugin::ReshapeFullyConnectedFusion::ReshapeFullyConnectedFusion() { fc->input_value(2), outShape, fc->output(0).get_element_type()); + } else { + return false; } new_ops.push_back(new_fc); new_fc->set_friendly_name(fc->get_friendly_name()); diff --git a/inference-engine/src/mkldnn_plugin/ngraph_transformations/reshape_fully_connected.cpp b/inference-engine/src/mkldnn_plugin/ngraph_transformations/reshape_fully_connected.cpp index 999d1b958d8d91..f140f44e74e701 100644 --- a/inference-engine/src/mkldnn_plugin/ngraph_transformations/reshape_fully_connected.cpp +++ b/inference-engine/src/mkldnn_plugin/ngraph_transformations/reshape_fully_connected.cpp @@ -60,6 +60,8 @@ MKLDNNPlugin::ReshapeFullyConnected::ReshapeFullyConnected() { fc->input_value(2), output_shape_new, fc->get_output_type()); + } else { + return false; } new_ops.push_back(fc_new); diff --git a/inference-engine/src/mkldnn_plugin/ngraph_transformations/reshape_prelu.cpp b/inference-engine/src/mkldnn_plugin/ngraph_transformations/reshape_prelu.cpp index 0cc1a33cbc3283..69fd75ea57a661 100644 --- a/inference-engine/src/mkldnn_plugin/ngraph_transformations/reshape_prelu.cpp +++ b/inference-engine/src/mkldnn_plugin/ngraph_transformations/reshape_prelu.cpp @@ -20,8 +20,16 @@ MKLDNNPlugin::ReshapePRelu::ReshapePRelu() { 
if (!prelu || ngraph::shape_size(prelu->get_input_shape(1)) == 1 || prelu->get_input_shape(1).size() != 1) { return false; } - ngraph::Shape new_shape(prelu->input_value(0).get_shape().size(), 1); - new_shape[new_shape.size() > 1 ? 1 : 0] = prelu->input_value(1).get_shape()[0]; + const auto prelu_shape = prelu->input_value(0).get_shape(); + const auto slope_shape = prelu->input_value(1).get_shape(); + ngraph::Shape new_shape(prelu_shape.size(), 1); + const auto slope_dim = slope_shape[0]; + const auto channel_dim_idx = prelu_shape.size() > 1 ? 1 : 0; + if (slope_dim != prelu_shape[channel_dim_idx]) { + return false; + } + new_shape[channel_dim_idx] = slope_dim; + auto slope = ngraph::op::util::reshapeTo(prelu->input_value(1), new_shape); auto new_prelu = std::make_shared(prelu->input(0).get_source_output(), slope); new_prelu->set_friendly_name(prelu->get_friendly_name()); diff --git a/inference-engine/src/mkldnn_plugin/nodes/bucketize.cpp b/inference-engine/src/mkldnn_plugin/nodes/bucketize.cpp deleted file mode 100644 index febdf1a8dfd0f2..00000000000000 --- a/inference-engine/src/mkldnn_plugin/nodes/bucketize.cpp +++ /dev/null @@ -1,242 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "base.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include "ie_parallel.hpp" -#include - -using namespace MKLDNNPlugin; - -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { - -class BucketizeImpl : public ExtLayerBase { - bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { - try { - const auto bucketsize = std::dynamic_pointer_cast(op); - if (!bucketsize) { - errorMessage = "Only opset3 Bucketize operation is supported"; - return false; - } - } catch (...) 
{ - return false; - } - return true; - } - - std::string errorPrefix; - -public: - explicit BucketizeImpl(const std::shared_ptr& op) { - try { - std::string errorMessage; - if (!isSupportedOperation(op, errorMessage)) { - IE_THROW(NotImplemented) << errorMessage; - } - - errorPrefix = "Bucketize layer with name '" + op->get_friendly_name() + "' "; - const auto bucketsize = std::dynamic_pointer_cast(op); - - if (op->get_input_size() != 2 || op->get_output_size() != 1) { - IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; - } - - // check one attribute - with_right = bucketsize->get_with_right_bound(); - - // check precisions for input and output tensors - input_precision = details::convertPrecision(op->get_input_element_type(INPUT_TENSOR_PORT)); - if (input_precision != Precision::FP32 && input_precision != Precision::I32 && - input_precision != Precision::I64) { - input_precision = Precision::FP32; - } - boundaries_precision = details::convertPrecision(op->get_input_element_type(INPUT_BINS_PORT)); - if (boundaries_precision != Precision::FP32 && boundaries_precision != Precision::I32 && - boundaries_precision != Precision::I64) { - boundaries_precision = Precision::FP32; - } - output_precision = details::convertPrecision(op->get_output_element_type(OUTPUT_TENSOR_PORT)); - if (output_precision != Precision::I32 && output_precision != Precision::I64) { - output_precision = Precision::I32; - } - - // check dimensions of input tensors - SizeVector input_tensor_dims = op->get_input_shape(INPUT_TENSOR_PORT); - if (input_tensor_dims.size() < 1) { - IE_THROW() << errorPrefix << " has incorrect dimensions of the input."; - } - SizeVector input_bin_dims = op->get_input_shape(INPUT_BINS_PORT); - if (input_bin_dims.size() != 1) { - IE_THROW() << errorPrefix << " has incorrect dimensions of the boundaries tensor."; - } - if (input_bin_dims[0] != 0) { - with_bins = true; - } - num_bin_values = input_bin_dims[0]; - - num_values = std::accumulate(input_tensor_dims.begin(), input_tensor_dims.end(), size_t(1), std::multiplies()); - - addConfig(op, {{TensorDescCreatorTypes::ncsp, input_precision}, - {TensorDescCreatorTypes::ncsp, boundaries_precision}}, - {{TensorDescCreatorTypes::ncsp, output_precision}}); - } - catch (InferenceEngine::Exception &ex) { - errorMsg = ex.what(); - } - } - - StatusCode execute(std::vector& inputs, std::vector& outputs, ResponseDesc *resp) noexcept override { - auto precision_mask = getPrecisionMask(input_precision, boundaries_precision, output_precision); - - switch (precision_mask) { - case getPrecisionMask(Precision::FP32, Precision::FP32, Precision::I32): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::FP32, Precision::FP32, Precision::I64): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::FP32, Precision::I32, Precision::I32): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::FP32, Precision::I32, Precision::I64): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::FP32, Precision::I64, Precision::I32): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], 
inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::FP32, Precision::I64, Precision::I64): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::I32, Precision::FP32, Precision::I32): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::I32, Precision::FP32, Precision::I64): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::I32, Precision::I32, Precision::I32): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::I32, Precision::I32, Precision::I64): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::I32, Precision::I64, Precision::I32): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::I32, Precision::I64, Precision::I64): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::I64, Precision::FP32, Precision::I32): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::I64, Precision::FP32, Precision::I64): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::I64, Precision::I32, Precision::I32): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::I64, Precision::I32, Precision::I64): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::I64, Precision::I64, Precision::I32): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - case getPrecisionMask(Precision::I64, Precision::I64, Precision::I64): - bucketize::value_type, - PrecisionTrait::value_type, - PrecisionTrait::value_type>(inputs[0], inputs[1], outputs[0]); - break; - default: - return GENERAL_ERROR; - } - - return OK; - } - -private: - template - void bucketize(Blob::Ptr input, Blob::Ptr boundaries, Blob::Ptr output) { - const auto *input_data = input->cbuffer().as(); - const auto *boundaries_data = boundaries->cbuffer().as(); - auto *output_data = output->buffer().as(); - - if (with_bins == false) { - memset(output_data, 0, num_values * sizeof(T_IND)); - return; - } - - // boundaries are assumed to be sorted and to have unique elements - parallel_for(num_values, [&](size_t ind) { - T value = input_data[ind]; - if (with_right) { - auto low = std::lower_bound(boundaries_data, boundaries_data + num_bin_values, value); - output_data[ind] = static_cast(low - boundaries_data); - } else { - auto up = std::upper_bound(boundaries_data, boundaries_data + num_bin_values, value); - output_data[ind] = static_cast(up - boundaries_data); - } - }); - } 
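The bucketize kernel deleted above reduces to a binary search over sorted, unique boundaries, with with_right selecting which side a value equal to a boundary falls on. A condensed standalone restatement of that behaviour, kept only to document what the removed node implemented (the names and the std::vector interface are illustrative):

#include <algorithm>
#include <cstddef>
#include <vector>

template <typename T, typename TBound, typename TInd>
std::vector<TInd> bucketize(const std::vector<T>& values,
                            const std::vector<TBound>& boundaries,
                            bool with_right) {
    std::vector<TInd> out(values.size(), TInd{0});
    if (boundaries.empty())
        return out;  // mirrors the with_bins == false early exit (all-zero output)
    for (std::size_t i = 0; i < values.size(); ++i) {
        // with_right == true: a value equal to a boundary stays in the bucket that ends at
        // that boundary (lower_bound); otherwise it moves to the next bucket (upper_bound).
        const auto it = with_right
            ? std::lower_bound(boundaries.begin(), boundaries.end(), values[i])
            : std::upper_bound(boundaries.begin(), boundaries.end(), values[i]);
        out[i] = static_cast<TInd>(it - boundaries.begin());
    }
    return out;
}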
- - const size_t INPUT_TENSOR_PORT = 0; - const size_t INPUT_BINS_PORT = 1; - const size_t OUTPUT_TENSOR_PORT = 0; - - size_t num_values = 0; - size_t num_bin_values = 0; - bool with_right = false; - bool with_bins = false; - - Precision input_precision; - Precision boundaries_precision; - Precision output_precision; -}; - -REG_FACTORY_FOR(BucketizeImpl, Bucketize); - -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine diff --git a/inference-engine/src/mkldnn_plugin/nodes/ctc_greedy_decoder.cpp b/inference-engine/src/mkldnn_plugin/nodes/ctc_greedy_decoder.cpp deleted file mode 100644 index 0ba6ca7e960230..00000000000000 --- a/inference-engine/src/mkldnn_plugin/nodes/ctc_greedy_decoder.cpp +++ /dev/null @@ -1,183 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "base.hpp" -#include "ie_parallel.hpp" -#include -#include - -#include -#include - -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { - -using MKLDNNPlugin::TensorDescCreatorTypes; - -class CTCGreedyDecoderImpl: public ExtLayerBase { -public: - static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { - try { - auto greedyDecOp = ngraph::as_type_ptr(op); - if (!greedyDecOp) { - errorMessage = "Node is not an instance of the CTCGreedyDecoder operation from operation set v0."; - return false; - } - } catch (...) { - return false; - } - - return true; - } - - explicit CTCGreedyDecoderImpl(const std::shared_ptr& op) : mergeRepeated_(true) { - try { - std::string errorMessage; - if (!isSupportedOperation(op, errorMessage)) { - IE_THROW(NotImplemented) << errorMessage; - } - - std::string errPrefix = "CTCGreedyDecoder layer with name '" + op->get_friendly_name() + "' "; - if (op->get_input_size() != 2) - IE_THROW() << errPrefix << "has invalid number of input edges: " << op->get_input_size(); - if (op->get_output_size() != 1) - IE_THROW() << errPrefix << "has invalid number of outputs edges: " << op->get_output_size(); - - if (op->get_input_shape(DATA_INDEX)[0] != op->get_input_shape(SEQUENCE_LENGTH_INDEX)[0] && - op->get_input_shape(DATA_INDEX)[1] != op->get_input_shape(SEQUENCE_LENGTH_INDEX)[1]) - IE_THROW() << errPrefix << "has invalid input shapes."; - - Precision inDataPrecision = details::convertPrecision(op->get_input_element_type(DATA_INDEX)); - if (inDataPrecision != Precision::FP32 && inDataPrecision != Precision::BF16) - IE_THROW() << errPrefix << "has unsupported 'data' input precision: " << inDataPrecision; - - Precision seqLenPrecision = details::convertPrecision(op->get_input_element_type(SEQUENCE_LENGTH_INDEX)); - if (seqLenPrecision != Precision::FP32 && seqLenPrecision != Precision::BF16) - IE_THROW() << errPrefix << "has unsupported 'sequence_length' input precision: " << seqLenPrecision; - - auto greedyDecOp = ngraph::as_type_ptr(op); - mergeRepeated_ = greedyDecOp->get_ctc_merge_repeated(); - - addConfig(op, {{TensorDescCreatorTypes::ncsp, Precision::FP32}, - {TensorDescCreatorTypes::ncsp, Precision::FP32}}, - {{TensorDescCreatorTypes::ncsp, Precision::FP32}}); - } catch (InferenceEngine::Exception &ex) { - errorMsg = ex.what(); - throw; - } - } - - StatusCode execute(std::vector& inputs, std::vector& outputs, - ResponseDesc *resp) noexcept override { - const float* probabilities = inputs[DATA_INDEX]->cbuffer().as() + - inputs[DATA_INDEX]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - const float* sequenceMask = inputs[SEQUENCE_LENGTH_INDEX]->cbuffer().as() + - 
inputs[SEQUENCE_LENGTH_INDEX]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - float* outputSequences = outputs[0]->buffer().as() + - outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - - const size_t T = inputs[DATA_INDEX]->getTensorDesc().getDims()[0]; - const size_t B = inputs[DATA_INDEX]->getTensorDesc().getDims()[1]; - const int C = inputs[DATA_INDEX]->getTensorDesc().getDims()[2]; - const size_t BC = B * C; - const size_t CB1 = C * (B - 1); - - const int blankIndex = C - 1; - - std::vector sequenceLengths(B, 0); - parallel_for(B, [&](size_t b) { - size_t t = 0; - for (; t < T; t++) { - if (sequenceMask[B * t + b] == 0.f) - break; - } - sequenceLengths[b] = t; - }); - - size_t workAmount = 0; - for (size_t b = 0; b < B; b++) { - workAmount += sequenceLengths[b]; - } - - // Parallelization could not be made directly by T due to output index depends on merged classes and - // blank index, thus could not be shared between threads. Better to divide operation on two steps. - // At the first stage find the maximum index. At second stage merge if needed. - // Such approach makes parallelization more efficient. - auto threadBody = [&](const int ithr, const int nthr) { - size_t start(0lu), end(0lu); - splitter(workAmount, nthr, ithr, start, end); - if (start >= end) - return; - size_t tStart = 0lu, bStart = 0lu; - for (; bStart < B; bStart++) { - tStart += sequenceLengths[bStart]; - if (tStart >= start) { - tStart = start - (tStart - sequenceLengths[bStart]); - break; - } - } - - size_t workCounter = start; - - for (size_t b = bStart; b < B; ++b) { - size_t outputIndex = b * T + tStart; - const float* probs = probabilities + b * C + BC * tStart; - size_t sequenceLength = sequenceLengths[b]; - - for (size_t t = tStart; t < sequenceLength; ++t) { - int maxClassIdx = 0; - - float maxProb = probs[0]; - ++probs; - - for (int c = 1; c < C; ++c, ++probs) { - if (*probs > maxProb) { - maxClassIdx = c; - maxProb = *probs; - } - } - probs += CB1; - outputSequences[outputIndex++] = static_cast(maxClassIdx); - - if (++workCounter >= end) { - return; - } - } - tStart = 0lu; - } - }; // thread body - - parallel_nt(0, threadBody); - - parallel_for(B, [&](size_t b) { - int prevClassIdx = -1; - size_t outputIndex = b * T; - const size_t sequenceLength = sequenceLengths[b]; - float* shiftedOut = outputSequences + b * T; - for (size_t t = 0; t < sequenceLength; ++t) { - if (*shiftedOut < blankIndex && - !(mergeRepeated_ && *shiftedOut == prevClassIdx)) { - outputSequences[outputIndex++] = *shiftedOut; - } - prevClassIdx = *shiftedOut; - shiftedOut++; - } - std::fill(outputSequences + outputIndex, outputSequences + (b + 1) * T, -1.f); - }); - - return OK; - } - -private: - const size_t DATA_INDEX = 0lu; - const size_t SEQUENCE_LENGTH_INDEX = 1lu; - bool mergeRepeated_; -}; - -REG_FACTORY_FOR(CTCGreedyDecoderImpl, CTCGreedyDecoder); - -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine diff --git a/inference-engine/src/mkldnn_plugin/nodes/ctc_greedy_decoder_seq_len.cpp b/inference-engine/src/mkldnn_plugin/nodes/ctc_greedy_decoder_seq_len.cpp deleted file mode 100644 index c60684ee0af3f8..00000000000000 --- a/inference-engine/src/mkldnn_plugin/nodes/ctc_greedy_decoder_seq_len.cpp +++ /dev/null @@ -1,203 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "base.hpp" -#include "ie_parallel.hpp" -#include -#include - -#include -#include - -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { - 
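The CTCGreedyDecoder implementation deleted above splits the work into a per-time-step argmax followed by a separate merge pass, precisely because the merge (blank removal plus optional repeat collapsing) cannot be parallelized over time directly. A standalone, single-sequence restatement of that two-step decode; the T x C layout and blank index C - 1 follow the removed code, while the blank test uses the "not equal to blank" form of the SeqLen variant, and T, C are assumed to be non-zero:

#include <algorithm>
#include <cstddef>
#include <vector>

std::vector<int> ctcGreedyDecode(const std::vector<float>& probs,  // T x C, row-major
                                 std::size_t T, std::size_t C,
                                 bool mergeRepeated) {
    const int blank = static_cast<int>(C) - 1;
    // Step 1: independent argmax over classes at every time step (parallelizable).
    std::vector<int> best(T);
    for (std::size_t t = 0; t < T; ++t) {
        const float* row = probs.data() + t * C;
        best[t] = static_cast<int>(std::max_element(row, row + C) - row);
    }
    // Step 2: sequential merge - drop blanks and, optionally, repeated classes.
    std::vector<int> decoded;
    int prev = -1;
    for (std::size_t t = 0; t < T; ++t) {
        if (best[t] != blank && !(mergeRepeated && best[t] == prev))
            decoded.push_back(best[t]);
        prev = best[t];
    }
    return decoded;
}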
-using MKLDNNPlugin::TensorDescCreatorTypes; - -class CTCGreedyDecoderSeqLenImpl: public ExtLayerBase { -public: - static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { - try { - auto greedyDecOp = ngraph::as_type_ptr(op); - if (!greedyDecOp) { - errorMessage = "Node is not an instance of the CTCGreedyDecoderSeqLen operation from operation set v6."; - return false; - } - } catch (...) { - return false; - } - - return true; - } - - explicit CTCGreedyDecoderSeqLenImpl(const std::shared_ptr& op) : mergeRepeated_(true) { - try { - std::string errorMessage; - if (!isSupportedOperation(op, errorMessage)) { - IE_THROW(NotImplemented) << errorMessage; - } - - std::string errPrefix = "CTCGreedyDecoderSeqLen layer with name '" + op->get_friendly_name() + "' "; - if (op->get_input_size() < 2 || op->get_input_size() > 3) - IE_THROW() << errPrefix << "has invalid number of input edges: " << op->get_input_size(); - if (op->get_output_size() != 2) - IE_THROW() << errPrefix << "has invalid number of outputs edges: " << op->get_output_size(); - - if (op->get_input_shape(DATA_INDEX)[0] != op->get_input_shape(SEQUENCE_LENGTH_INDEX)[0]) - IE_THROW() << errPrefix << "has invalid input shapes."; - - Precision inDataPrecision = details::convertPrecision(op->get_input_element_type(DATA_INDEX)); - if (inDataPrecision != Precision::FP32 && inDataPrecision != Precision::BF16) - IE_THROW() << errPrefix << "has unsupported 'data' input precision: " << inDataPrecision; - - Precision seqLenPrecision = details::convertPrecision(op->get_input_element_type(SEQUENCE_LENGTH_INDEX)); - if (seqLenPrecision != Precision::I32 && seqLenPrecision != Precision::I64) - IE_THROW() << errPrefix << "has unsupported 'sequence_length' input precision: " << seqLenPrecision; - - auto greedyDecOp = ngraph::as_type_ptr(op); - mergeRepeated_ = greedyDecOp->get_merge_repeated(); - - if (op->get_input_size() == BLANK_INDEX) { - addConfig(op, {{TensorDescCreatorTypes::ncsp, Precision::FP32}, - {TensorDescCreatorTypes::ncsp, Precision::I32}}, - {{TensorDescCreatorTypes::ncsp, Precision::I32}, - {TensorDescCreatorTypes::ncsp, Precision::I32}}); - } else { - Precision blIdxPrecision = details::convertPrecision(op->get_input_element_type(BLANK_INDEX)); - if (blIdxPrecision != Precision::I32 && blIdxPrecision != Precision::I64) - IE_THROW() << errPrefix << "has unsupported 'blank_index' input precision: " << blIdxPrecision; - - addConfig(op, {{TensorDescCreatorTypes::ncsp, Precision::FP32}, - {TensorDescCreatorTypes::ncsp, Precision::I32}, - {TensorDescCreatorTypes::ncsp, Precision::I32}}, - {{TensorDescCreatorTypes::ncsp, Precision::I32}, - {TensorDescCreatorTypes::ncsp, Precision::I32}}); - } - } catch (InferenceEngine::Exception &ex) { - errorMsg = ex.what(); - throw; - } - } - - StatusCode execute(std::vector& inputs, std::vector& outputs, - ResponseDesc *resp) noexcept override { - const float* probabilities = inputs[DATA_INDEX]->cbuffer().as() + - inputs[DATA_INDEX]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - const int* sequenceLengths = inputs[SEQUENCE_LENGTH_INDEX]->cbuffer().as() + - inputs[SEQUENCE_LENGTH_INDEX]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - int* decodedClasses = outputs[DECODED_CLASSES_INDEX]->buffer().as() + - outputs[DECODED_CLASSES_INDEX]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - int* decodedClassesLength = outputs[DECODED_CLASSES_LENGTH_INDEX]->buffer().as() + - 
outputs[DECODED_CLASSES_LENGTH_INDEX]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - - const auto& inDims = inputs[DATA_INDEX]->getTensorDesc().getDims(); - const size_t B = inDims[0]; - const size_t T = inDims[1]; - const int C = inDims[2]; - const size_t TC = T * C; - - int blankIndex = C - 1; - if (inputs.size() > BLANK_INDEX) - blankIndex = (inputs[BLANK_INDEX]->cbuffer().as() + - inputs[BLANK_INDEX]->getTensorDesc().getBlockingDesc().getOffsetPadding())[0]; - - size_t workAmount = 0; - for (size_t b = 0; b < B; b++) { - if (sequenceLengths[b] > T) { - if (resp) { - std::string errorMsg = errPrefix - + ". Sequence length " + std::to_string(sequenceLengths[b]) - + " cannot be greater than according decoded classes dimension size " - + std::to_string(outputs[DECODED_CLASSES_INDEX]->getTensorDesc().getDims()[1]); - errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); - } - return PARAMETER_MISMATCH; - } - workAmount += sequenceLengths[b]; - } - // Parallelization could not be made directly by T due to output index depends on merged classes and - // blank index, thus could not be shared between threads. Better to divide operation on two steps. - // At the first stage find the maximum index. At second stage merge if needed. - // Such approach makes parallelization more efficient. - auto threadBody = [&](const int ithr, const int nthr) { - size_t start(0lu), end(0lu); - splitter(workAmount, nthr, ithr, start, end); - if (start >= end) - return; - size_t tStart = 0lu, bStart = 0lu; - for (; bStart < B; bStart++) { - tStart += sequenceLengths[bStart]; - if (tStart >= start) { - tStart = start - (tStart - sequenceLengths[bStart]); - break; - } - } - - size_t workCounter = start; - - for (size_t b = bStart; b < B; ++b) { - size_t outputIndex = b * T + tStart; - const float* probs = probabilities + b * TC + C * tStart; - const size_t actualSeqLen = sequenceLengths[b]; - - for (size_t t = tStart; t < actualSeqLen; ++t) { - int maxClassIdx = 0; - float maxProb = probs[0]; - probs++; - - for (int c = 1; c < C; c++, probs++) { - if (*probs > maxProb) { - maxClassIdx = c; - maxProb = *probs; - } - } - decodedClasses[outputIndex++] = maxClassIdx; - - if (++workCounter >= end) { - return; - } - } - tStart = 0lu; - } - }; // thread body - - parallel_nt(0, threadBody); - - parallel_for(B, [&](size_t b) { - int prevClassIdx = -1; - size_t outputIndex = b * T; - const size_t actualSeqLen = sequenceLengths[b]; - int* shiftedOut = decodedClasses + b * T; - - for (size_t t = 0; t < actualSeqLen; ++t) { - if (*shiftedOut != blankIndex && - !(mergeRepeated_ && *shiftedOut == prevClassIdx)) { - decodedClasses[outputIndex++] = *shiftedOut; - } - prevClassIdx = *shiftedOut; - shiftedOut++; - } - std::fill(decodedClasses + outputIndex, decodedClasses + (b + 1) * T, -1); - decodedClassesLength[b] = outputIndex - b * T; - }); - - return OK; - } - -private: - const size_t DATA_INDEX = 0lu; - const size_t SEQUENCE_LENGTH_INDEX = 1lu; - const size_t BLANK_INDEX = 2lu; - const size_t DECODED_CLASSES_INDEX = 0lu; - const size_t DECODED_CLASSES_LENGTH_INDEX = 1lu; - bool mergeRepeated_; - std::string errPrefix; -}; - -REG_FACTORY_FOR(CTCGreedyDecoderSeqLenImpl, CTCGreedyDecoderSeqLen); - -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine diff --git a/inference-engine/src/mkldnn_plugin/nodes/ctc_loss.cpp b/inference-engine/src/mkldnn_plugin/nodes/ctc_loss.cpp deleted file mode 100644 index 84d6b55a1a47e9..00000000000000 --- a/inference-engine/src/mkldnn_plugin/nodes/ctc_loss.cpp +++ 
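The CTCLoss kernel removed below evaluates log-softmax directly as logits[b][t][c] - ln(sum_c(exp(logits[b][t]))), exponentiating the raw logits. A standalone reference implementation of the same quantity in its numerically safer max-subtracted form, shown only to document the formula, not code taken from the patch:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

std::vector<float> logSoftmax(const std::vector<float>& logits) {
    std::vector<float> out(logits.size());
    if (logits.empty())
        return out;
    // Subtracting the maximum keeps exp() in range; the result is identical in exact arithmetic.
    const float maxLogit = *std::max_element(logits.begin(), logits.end());
    float expSum = 0.f;
    for (const float v : logits)
        expSum += std::exp(v - maxLogit);
    const float logSum = maxLogit + std::log(expSum);
    for (std::size_t c = 0; c < logits.size(); ++c)
        out[c] = logits[c] - logSum;
    return out;
}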
/dev/null @@ -1,302 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "base.hpp" -#include "ie_parallel.hpp" -#include -#include - -#include - - -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { - -using MKLDNNPlugin::TensorDescCreatorTypes; - -class CTCLossImpl : public ExtLayerBase { -public: - static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { - try { - auto ctcLossOp = ngraph::as_type_ptr(op); - if (!ctcLossOp) { - errorMessage = "Node is not an instance of the CTCLoss operation from operation set v4."; - return false; - } - } catch (...) { - return false; - } - - return true; - } - - explicit CTCLossImpl(const std::shared_ptr& op) { - try { - std::string errorMessage; - if (!isSupportedOperation(op, errorMessage)) { - IE_THROW(NotImplemented) << errorMessage; - } - - _logPrefix = std::string("CTCLoss layer with name '") + op->get_friendly_name() + "'"; - - if (op->get_input_size() != 4 && op->get_input_size() != 5) - IE_THROW() << _logPrefix << " has invalid inputs number."; - - auto ctcLossOp = ngraph::as_type_ptr(op); - _ctcMergeRepeated = ctcLossOp->get_ctc_merge_repeated(); - _preprocessCollapseRepeated = ctcLossOp->get_preprocess_collapse_repeated(); - _unique = ctcLossOp->get_unique(); - - std::vector inDataConfigurators; - inDataConfigurators.push_back({TensorDescCreatorTypes::ncsp, Precision::FP32}); - for (int i = 1; i < op->get_input_size(); i++) { - inDataConfigurators.push_back({TensorDescCreatorTypes::ncsp, Precision::I32}); - } - addConfig(op, inDataConfigurators, - {{TensorDescCreatorTypes::ncsp, Precision::FP32}}); - } catch (InferenceEngine::Exception &ex) { - errorMsg = ex.what(); - throw; - } - } - - StatusCode execute(std::vector& inputs, - std::vector& outputs, - ResponseDesc *resp) noexcept override { - StatusCode returnCode = OK; - - const float* logits = inputs[0]->cbuffer().as() + - inputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - const int* logitsLength = inputs[1]->cbuffer().as() + - inputs[1]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - const int* labels = inputs[2]->cbuffer().as() + - inputs[2]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - const int* labelsLength = inputs[3]->cbuffer().as() + - inputs[3]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - float* dstData = outputs[0]->buffer().as() + - outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - - const auto& logitsShape = inputs[0]->getTensorDesc().getDims(); - const size_t batchNum = logitsShape[0]; - const size_t maxTime = logitsShape[1]; - const size_t classesNum = logitsShape[2]; - - int blankIndex = classesNum - 1; - if (inputs.size() > 4) { - blankIndex = inputs[4]->cbuffer().as()[0]; - } - - std::vector decodedTargetLenB(batchNum, 0); - std::vector> targetDB(batchNum); - std::vector>> logProbabilitiesB(batchNum); - std::vector errorMsgB(parallel_get_max_threads()); - - auto threadBody_1 = [&](const int ithr, const int nthr) { - size_t start(0lu), end(0lu); - splitter(batchNum, nthr, ithr, start, end); - if (start >= end) - return; - - for (size_t b = start; b < end; b++) { - if (logitsLength[b] < 0 || labelsLength[b] < 0 || logitsLength[b] > maxTime || labelsLength[b] > logitsLength[b]) { - errorMsgB[ithr] = _logPrefix + ". Logit length cannot be greater than max sequence length. 
" - + "Label length cannot be greater than a logit length" - + " and both cannot be negative.\nMaxSeqLen: " - + std::to_string(maxTime) + "; Logit len: " + std::to_string(logitsLength[b]) - + "; Label len: " + std::to_string(labelsLength[b]); - returnCode = GENERAL_ERROR; - return; - } - const size_t actualLogitLen = logitsLength[b]; - const size_t actualTargetLen = labelsLength[b]; - size_t decodedTargetLen = 0lu; - - // Decoding target: merge repeated characters if preprocess_collapse_repeated == True, - // find unique elemnts if unique == True. - // Inserts blanks before each index and a blank at the end. - const int* target = &labels[b * maxTime]; - targetDB[b].resize(actualTargetLen * 2 + 1); - auto& targetD = targetDB[b]; - if (_unique) { - std::unordered_set uniqVals; - for (size_t t = 0lu; t < actualTargetLen; t++) { - if (uniqVals.find(target[t]) != uniqVals.end()) { - continue; - } - uniqVals.insert(target[t]); - targetD[decodedTargetLen++] = blankIndex; - targetD[decodedTargetLen++] = target[t]; - } - targetD[decodedTargetLen++] = blankIndex; - } else if (_preprocessCollapseRepeated) { - auto prevValue = target[0]; - targetD[decodedTargetLen++] = blankIndex; - targetD[decodedTargetLen++] = target[0]; - for (size_t t = 1lu; t < actualTargetLen; t++) { - if (target[t] == prevValue) { - continue; - } - targetD[decodedTargetLen++] = blankIndex; - targetD[decodedTargetLen++] = prevValue = target[t]; - } - targetD[decodedTargetLen++] = blankIndex; - } else { - for (size_t t = 0lu; t < actualTargetLen; t++) { - targetD[decodedTargetLen++] = blankIndex; - targetD[decodedTargetLen++] = target[t]; - } - targetD[decodedTargetLen++] = blankIndex; - } - decodedTargetLenB[b] = decodedTargetLen; - - auto& logProbabilities = logProbabilitiesB[b]; - logProbabilities.resize(actualLogitLen); - for (size_t ll = 0; ll < actualLogitLen; ll++) { - logProbabilities[ll].resize(decodedTargetLen); - } - } // for batch - }; // threadBody_1 - - parallel_nt(0, threadBody_1); - if (returnCode != OK) { - std::string resErr(""); - for (auto& err : errorMsgB) { - if (!err.empty()) - resErr += err + "\n"; - resErr.copy(resp->msg, sizeof(resp->msg) - 1); - } - return returnCode; - } - - const size_t TC = maxTime * classesNum; - - size_t workAmount2 = 0lu; - for (size_t b = 0; b < batchNum; b++) { - workAmount2 += logitsLength[b]; - } - - auto threadBody_2 = [&](const int ithr, const int nthr) { - size_t start(0lu), end(0lu); - size_t sB(0lu), sT(0lu); - splitter(workAmount2, nthr, ithr, start, end); - if (start >= end) - return; - int64_t cw = 0, st = start; - for (; sB < batchNum; sB++) { - cw += logitsLength[sB]; - if (cw >= st) { - sT = logitsLength[sB] + st - cw; - break; - } - } - size_t workCounter = start; - - for (size_t b = sB; b < batchNum; b++) { - const size_t actualLogitLen = logitsLength[b]; - const size_t decodedTargetLen = decodedTargetLenB[b]; - auto& logProbabilities = logProbabilitiesB[b]; - auto& targetD = targetDB[b]; - - double expSum = 0.0; - size_t btcT = b * TC + sT * classesNum; - // logProbabilities = logSoftmax = logits[b][t][c] - ln(sum_c(exp(logits[b][t]))) - for (size_t t = sT; t < actualLogitLen; t++) { - expSum = 0.0; - for (size_t c = 0lu; c < classesNum; c++) { - expSum += std::exp(logits[btcT + c]); - } - for (size_t s = 0lu; s < decodedTargetLen; s++) { - logProbabilities[t][s] = logits[btcT + targetD[s]] - std::log(expSum); - } - btcT += classesNum; - if (++workCounter >= end) { - return; - } - } - sT = 0lu; - } // for batch - }; // threadBody_2 - - parallel_nt(0, 
threadBody_2); - - const auto float_inf = std::numeric_limits::infinity(); - - auto sumLogs = [&float_inf](float log1, float log2) { - if (log1 == -float_inf) { - return log2; - } else if (log2 == -float_inf) { - return log1; - } else { - if (log1 > log2) - return log1 + std::log1pf(std::exp(log2 - log1)); - else - return log2 + std::log1pf(std::exp(log1 - log2)); - } - }; - - auto threadBody_3 = [&](const int ithr, const int nthr) { - size_t start(0lu), end(0lu); - splitter(batchNum, nthr, ithr, start, end); - if (start >= end) - return; - - // As per Connectionist Temporal Classification - Labeling Unsegmented Sequence Data with Recurrent Neural Networks: - // Graves et al., 2016, paragraph 4.1 (10) - for (size_t b = start; b < end; b++) { - auto& targetD = targetDB[b]; - auto& logProbabilities = logProbabilitiesB[b]; - const int actualLogitLen = logitsLength[b]; - const int decodedTargetLen = decodedTargetLenB[b]; - std::vector> logBwd(decodedTargetLen, std::vector(actualLogitLen, -float_inf)); - for (int s = decodedTargetLen - 2; s < decodedTargetLen; s++) - logBwd[s][actualLogitLen - 1] = 0.f; - - for (int t = actualLogitLen - 2; t >= 0; t--) { - const int t_1 = t + 1; - for (int s = std::max(0, decodedTargetLen - (2 * (actualLogitLen - t))); - s < std::min(decodedTargetLen, 2 * (t_1)); s++) { - if (_ctcMergeRepeated || targetD[s] == blankIndex) { - logBwd[s][t] = sumLogs(logBwd[s][t], - logBwd[s][t_1] + logProbabilities[t_1][s]); - } - - if (s + 1 < decodedTargetLen) { - logBwd[s][t] = sumLogs(logBwd[s][t], - logBwd[s + 1][t_1] + logProbabilities[t_1][s + 1]); - } - - if (s + 2 < decodedTargetLen) { - if (targetD[s] != blankIndex && (!_ctcMergeRepeated || (targetD[s] != targetD[s + 2]))) { - logBwd[s][t] = sumLogs(logBwd[s][t], - logBwd[s + 2][t_1] + logProbabilities[t_1][s + 2]); - } - } - } - } - - logBwd[0][0] += logProbabilities[0][0]; - logBwd[1][0] += logProbabilities[0][(decodedTargetLen > 1) ? 1 : 0]; - - dstData[b] = -sumLogs(logBwd[0][0], logBwd[1][0]); - } // for batch - }; // threadBody_3 - - parallel_nt(0, threadBody_3); - - return returnCode; - } // execute - -protected: - bool _ctcMergeRepeated; - bool _preprocessCollapseRepeated; - bool _unique; - - std::string _logPrefix; -}; - -REG_FACTORY_FOR(CTCLossImpl, CTCLoss); -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine diff --git a/inference-engine/src/mkldnn_plugin/nodes/cum_sum.cpp b/inference-engine/src/mkldnn_plugin/nodes/cum_sum.cpp deleted file mode 100644 index 8940527713cd36..00000000000000 --- a/inference-engine/src/mkldnn_plugin/nodes/cum_sum.cpp +++ /dev/null @@ -1,271 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "list.hpp" -#include "base.hpp" - -#include -#include -#include "ie_parallel.hpp" -#include "ie_precision.hpp" -#include -#include -#include - -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { - -using MKLDNNPlugin::TensorDescCreatorTypes; - -class CumSumImpl: public ExtLayerBase { - enum { CUM_SUM_DATA, AXIS, numOfInputs }; - bool exclusive; - bool reverse; - size_t numOfDims; - size_t axis = 0; - std::vector shape; - - bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { - try { - const auto cumsum = std::dynamic_pointer_cast(op); - if (!cumsum) { - errorMessage = "Only opset3 CumSum operation is supported"; - return false; - } - } catch (...) 
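For reference, the deleted CTCLossImpl follows Graves et al. (2006): it builds the blank-extended target of length 2L+1, turns the logits into log-softmax values, and runs a log-space dynamic programme, using the sumLogs helper above to add probabilities without leaving log space. The plugin evaluates the backward recursion; the sketch below uses the equivalent and more commonly written forward recursion, corresponds to the default attribute values (no collapse/unique preprocessing, ctc_merge_repeated on), and uses hypothetical function names.

#include <cmath>
#include <cstddef>
#include <limits>
#include <vector>

// Log-space addition: log(exp(a) + exp(b)), robust to -inf, mirroring sumLogs above.
static float logAdd(float a, float b) {
    const float ninf = -std::numeric_limits<float>::infinity();
    if (a == ninf) return b;
    if (b == ninf) return a;
    return (a > b) ? a + std::log1p(std::exp(b - a)) : b + std::log1p(std::exp(a - b));
}

// CTC loss for one sequence. logits: [T x C] row-major, unnormalised.
float ctcLoss(const std::vector<float>& logits, size_t T, size_t C,
              const std::vector<int>& labels, int blank) {
    const float ninf = -std::numeric_limits<float>::infinity();
    // Blank-extended target: blank, l1, blank, l2, ..., blank  (length S = 2L + 1).
    std::vector<int> ext(2 * labels.size() + 1, blank);
    for (size_t i = 0; i < labels.size(); ++i) ext[2 * i + 1] = labels[i];
    const size_t S = ext.size();

    // Per-timestep log-softmax of the logits.
    std::vector<float> logProb(T * C);
    for (size_t t = 0; t < T; ++t) {
        double denom = 0.0;
        for (size_t c = 0; c < C; ++c) denom += std::exp(logits[t * C + c]);
        for (size_t c = 0; c < C; ++c)
            logProb[t * C + c] = logits[t * C + c] - static_cast<float>(std::log(denom));
    }

    // Forward recursion over the extended target.
    std::vector<float> alpha(S, ninf), next(S, ninf);
    alpha[0] = logProb[ext[0]];
    if (S > 1) alpha[1] = logProb[ext[1]];
    for (size_t t = 1; t < T; ++t) {
        for (size_t s = 0; s < S; ++s) {
            float a = alpha[s];
            if (s >= 1) a = logAdd(a, alpha[s - 1]);
            if (s >= 2 && ext[s] != blank && ext[s] != ext[s - 2]) a = logAdd(a, alpha[s - 2]);
            next[s] = (a == ninf) ? ninf : a + logProb[t * C + ext[s]];
        }
        alpha.swap(next);
    }
    const float total = (S > 1) ? logAdd(alpha[S - 1], alpha[S - 2]) : alpha[S - 1];
    return -total;
}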
{ - return false; - } - return true; - } - -public: - explicit CumSumImpl(const std::shared_ptr& op) { - try { - std::string errorMessage; - if (!isSupportedOperation(op, errorMessage)) { - IE_THROW(NotImplemented) << errorMessage; - } - - layerName = op->get_friendly_name(); - if ((op->get_input_size() != numOfInputs && op->get_input_size() != (numOfInputs - 1)) || op->get_output_size() != 1) - IE_THROW() << "CumSum layer with name '" << layerName << "' has incorrect number of input/output edges!"; - - const auto &dataShape = op->get_input_shape(CUM_SUM_DATA); - if (dataShape.size() < 1) { - IE_THROW() << "CumSum layer with name '" << layerName << "' doesn't support 'data' input tensor with rank: " << dataShape.size(); - } - numOfDims = dataShape.size(); - - const auto cumsum = std::dynamic_pointer_cast(op); - exclusive = cumsum->is_exclusive(); - reverse = cumsum->is_reverse(); - - auto dataPrecision = details::convertPrecision(cumsum->get_input_element_type(CUM_SUM_DATA)); - if (dataPrecision != Precision::I8 && dataPrecision != Precision::U8 && dataPrecision != Precision::I16 && dataPrecision != Precision::I32 && - dataPrecision != Precision::FP32 && dataPrecision != Precision::I64 && dataPrecision != Precision::U64 && dataPrecision != Precision::BF16) - IE_THROW() << "CumSum layer with name '" << layerName << "' has unsupported 'data' input precision: " << dataPrecision.name(); - - if (cumsum->get_input_size() == numOfInputs) { - const auto& axisTensorPrec = details::convertPrecision(cumsum->get_input_element_type(AXIS)); - if (axisTensorPrec != Precision::I32 && axisTensorPrec != Precision::I64) - IE_THROW() << "CumSum layer with name '" << layerName << "' has unsupported 'axis' input precision: " << axisTensorPrec.name(); - - if (!ngraph::is_scalar(cumsum->get_input_shape(AXIS))) - IE_THROW() << "CumSum layer with name '" << layerName << "' doesn't support 'axis' input tensor with non scalar rank"; - } - - if (dataShape != cumsum->get_output_shape(0)) - IE_THROW() << "CumSum layer with name '" << layerName << "' has different 'data' input and output dimensions"; - - shape = dataShape; - - std::vector inDataConfigurators; - if (dataPrecision == Precision::BF16) - dataPrecision = Precision::FP32; - inDataConfigurators.push_back({TensorDescCreatorTypes::ncsp, dataPrecision}); - if (op->get_input_size() > 1) - inDataConfigurators.push_back({TensorDescCreatorTypes::ncsp, Precision::I32}); - addConfig(op, inDataConfigurators, {{TensorDescCreatorTypes::ncsp, dataPrecision}}); - } catch (InferenceEngine::Exception &ex) { - errorMsg = ex.what(); - } - } - - StatusCode execute(std::vector& inputs, std::vector& outputs, ResponseDesc *resp) noexcept override { - if (inputs.size() == numOfInputs) - axis = getAxis(inputs[AXIS], inputs[CUM_SUM_DATA]); - - const auto &dataPrecision = inputs[CUM_SUM_DATA]->getTensorDesc().getPrecision(); - switch (dataPrecision) { - case Precision::I8 : { execImpl(inputs[CUM_SUM_DATA], outputs[0]); break; } - case Precision::U8 : { execImpl(inputs[CUM_SUM_DATA], outputs[0]); break; } - case Precision::I16 : { execImpl(inputs[CUM_SUM_DATA], outputs[0]); break; } - case Precision::I32 : { execImpl(inputs[CUM_SUM_DATA], outputs[0]); break; } - case Precision::FP32 : { execImpl(inputs[CUM_SUM_DATA], outputs[0]); break; } - case Precision::I64 : { execImpl(inputs[CUM_SUM_DATA], outputs[0]); break; } - case Precision::U64 : { execImpl(inputs[CUM_SUM_DATA], outputs[0]); break; } - default : { - if (resp) { - std::string errorMsg = "CumSum layer with name '" + layerName + 
"' has unsupported 'data' input precision: " + dataPrecision.name(); - errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); - } - return GENERAL_ERROR; - } - } - return OK; - } - -private: - template - void execImpl(const Blob::CPtr& _input, const Blob::Ptr& _output) { - const auto *input = _input->cbuffer().as() + _input->getTensorDesc().getBlockingDesc().getOffsetPadding(); - auto *output = _output->buffer().as() + _output->getTensorDesc().getBlockingDesc().getOffsetPadding(); - const std::vector strides = _input->getTensorDesc().getBlockingDesc().getStrides(); - - if (reverse) { - if (exclusive) { - cumSum(input, output, strides); - } else { - cumSum(input, output, strides); - } - } else { - if (exclusive) { - cumSum(input, output, strides); - } else { - cumSum(input, output, strides); - } - } - } - - template - void cumSum(const dataType *input, dataType *output, const std::vector &strides) { - SizeVector iterationRange(numOfDims - 1); - size_t j = 0; - for (size_t i = 0; i < shape.size(); i++) { - if (i == axis) - continue; - iterationRange[j++] = shape[i]; - } - size_t work_amount_dst = std::accumulate(iterationRange.begin(), iterationRange.end(), 1, std::multiplies()); - parallel_nt(0, [&](const int ithr, const int nthr) { - size_t start = 0, end = 0; - SizeVector counters(numOfDims - 1, 0); - splitter(work_amount_dst, nthr, ithr, start, end); - - parallelItInit(start, counters, iterationRange); - - for (size_t iwork = start; iwork < end; ++iwork) { - std::vector forStartOffset(numOfDims); - forStartOffset[axis] = 0; - for (size_t offsetIdx = 0, countersIdx = 0; offsetIdx < numOfDims; ++offsetIdx) { - if (offsetIdx == axis) { - continue; - } - forStartOffset[offsetIdx] = counters[countersIdx++]; - } - - size_t startOffset = getStartOffset(forStartOffset, strides); - - const dataType *inputStart = input + startOffset; - dataType *outputStart = output + startOffset; - - size_t offset = strides[axis]; - if (reverse) { - if (exclusive) { - outputStart[offset*(shape[axis] - 1)] = 0; - for (int64_t i = shape[axis] - 2; i >= 0; i--) { - outputStart[i*offset] = inputStart[(i+1)*offset] + outputStart[(i+1)*offset]; - } - } else { - outputStart[offset*(shape[axis] - 1)] = inputStart[offset * (shape[axis] - 1)]; - for (int64_t i = shape[axis] - 2; i >= 0; i--) { - outputStart[i*offset] = inputStart[i*offset] + outputStart[(i+1)*offset]; - } - } - } else { - if (exclusive) { - outputStart[0] = 0; - for (size_t i = 1; i < shape[axis]; i++) { - outputStart[i*offset] = inputStart[(i-1)*offset] + outputStart[(i-1)*offset]; - } - } else { - outputStart[0] = inputStart[0]; - for (size_t i = 1; i < shape[axis]; i++) { - outputStart[i*offset] = inputStart[i*offset] + outputStart[(i-1)*offset]; - } - } - } - - parallelItStep(counters, iterationRange); - } - }); - } - - void parallelItInit(size_t start, std::vector& counters, const std::vector& iterationRange) { - auto itCounter = counters.rbegin(); - auto itWork = iterationRange.rbegin(); - while (itCounter != counters.rend() && itWork != iterationRange.rend()) { - *itCounter = start % *itWork; - start /= *itWork; - ++itCounter; - ++itWork; - } - } - - inline void parallelItStep(std::vector& counters, const std::vector& iterationRange) { - auto itCounter = counters.rbegin(); - auto itWork = iterationRange.rbegin(); - - while (itCounter != counters.rend() && itWork != iterationRange.rend()) { - *itCounter = (*itCounter + 1) % *itWork; - if (*itCounter != 0) { - break; - } - ++itCounter; - ++itWork; - } - } - - inline size_t getStartOffset(const std::vector 
&forStartOffset, const std::vector& strides) const { - size_t startOffset = 0; - for (size_t idx = 0; idx < forStartOffset.size(); ++idx) { - startOffset += forStartOffset[idx] * strides[idx]; - } - return startOffset; - } - - size_t getAxis(const Blob::CPtr& _axis, const Blob::CPtr& _data) const { - const auto& axisPrecision = _axis->getTensorDesc().getPrecision(); - const int64_t dataShapeSize = static_cast(_data->getTensorDesc().getDims().size()); - int64_t axisValueFromBlob; - switch (axisPrecision) { - case Precision::I32 : { - const auto *axisPtr = _axis->cbuffer().as(); - axisValueFromBlob = static_cast(axisPtr[0]); - break; - } - case Precision::I64 : { - const auto *axisPtr = _axis->cbuffer().as(); - axisValueFromBlob = axisPtr[0]; - break; - } - default : { - IE_THROW() << "CumSum layer with name '" << layerName << "' doesn't support 'axis' input with precision: " << axisPrecision.name(); - } - } - if (axisValueFromBlob < -dataShapeSize || axisValueFromBlob > dataShapeSize - 1) - IE_THROW() << "CumSum layer with name '" << layerName << "' has axis with a value out of range: " << axisValueFromBlob; - return axisValueFromBlob >= 0 ? axisValueFromBlob : (axisValueFromBlob + dataShapeSize); - } - -private: - std::string layerName; -}; - -REG_FACTORY_FOR(CumSumImpl, CumSum); - -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine \ No newline at end of file diff --git a/inference-engine/src/mkldnn_plugin/nodes/detectionoutput.cpp b/inference-engine/src/mkldnn_plugin/nodes/detectionoutput.cpp deleted file mode 100644 index bd3b1da8fc878c..00000000000000 --- a/inference-engine/src/mkldnn_plugin/nodes/detectionoutput.cpp +++ /dev/null @@ -1,663 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "base.hpp" - -#include -#include -#include -#include -#include -#include -#include "caseless.hpp" -#include "ie_parallel.hpp" -#include "common/tensor_desc_creator.h" -#include - -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { - -using MKLDNNPlugin::TensorDescCreatorTypes; - -template -static bool SortScorePairDescend(const std::pair& pair1, - const std::pair& pair2) { - return pair1.first > pair2.first; -} - -class DetectionOutputImpl: public ExtLayerBase { -public: - bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { - try { - auto doOp = ngraph::as_type_ptr(op); - if (!doOp) { - errorMessage = "Node is not an instance of the DetectionOutput from the operations set v0."; - return false; - } - if (!details::CaselessEq()(doOp->get_attrs().code_type, "caffe.PriorBoxParameter.CENTER_SIZE") && - !details::CaselessEq()(doOp->get_attrs().code_type, "caffe.PriorBoxParameter.CORNER")) { - errorMessage = "Unsupported code_type attribute."; - return false; - } - } catch (...) 
{ - return false; - } - return true; - } - - explicit DetectionOutputImpl(const std::shared_ptr& op) { - try { - std::string errorMessage; - if (!isSupportedOperation(op, errorMessage)) { - IE_THROW(NotImplemented) << errorMessage; - } - if (op->get_input_size() != 3 && op->get_input_size() != 5) - IE_THROW() << "Invalid number of input edges."; - - if (op->get_output_size() != 1) - IE_THROW() << "Invalid number of output edges."; - - auto doOp = ngraph::as_type_ptr(op); - auto attributes = doOp->get_attrs(); - - _num_classes = attributes.num_classes; - _background_label_id = attributes.background_label_id; - _top_k = attributes.top_k; - _variance_encoded_in_target = attributes.variance_encoded_in_target; - _keep_top_k = attributes.keep_top_k[0]; - _nms_threshold = attributes.nms_threshold; - _confidence_threshold = attributes.confidence_threshold; - _share_location = attributes.share_location; - _clip_before_nms = attributes.clip_before_nms; - _clip_after_nms = attributes.clip_after_nms; - _decrease_label_id = attributes.decrease_label_id; - _normalized = attributes.normalized; - _image_height = attributes.input_height; - _image_width = attributes.input_width; - _prior_size = _normalized ? 4 : 5; - _offset = _normalized ? 0 : 1; - _num_loc_classes = _share_location ? 1 : _num_classes; - - with_add_box_pred = op->get_input_size() == 5; - _objectness_score = attributes.objectness_score; - - _code_type = (details::CaselessEq()(attributes.code_type, "caffe.PriorBoxParameter.CENTER_SIZE") ? - CodeType::CENTER_SIZE : CodeType::CORNER); - - _num_priors = static_cast(op->get_input_shape(idx_priors).back() / _prior_size); - _priors_batches = op->get_input_shape(idx_priors).front() != 1; - - if (_num_priors * _num_loc_classes * 4 != static_cast(op->get_input_shape(idx_location)[1])) - IE_THROW() << "Number of priors must match number of location predictions (" - << _num_priors * _num_loc_classes * 4 << " vs " - << op->get_input_shape(idx_location)[1] << ")"; - - if (_num_priors * _num_classes != static_cast(op->get_input_shape(idx_confidence).back())) - IE_THROW() << "Number of priors must match number of confidence predictions."; - - if (_decrease_label_id && _background_label_id != 0) - IE_THROW() << "Cannot use decrease_label_id and background_label_id parameter simultaneously."; - - _num = static_cast(op->get_input_shape(idx_confidence)[0]); - - _decoded_bboxes.resize(_num * _num_classes * _num_priors * 4); - _buffer.resize(_num * _num_classes * _num_priors); - _indices.resize(_num * _num_classes * _num_priors); - _detections_count.resize(_num * _num_classes); - _bbox_sizes.resize(_num * _num_classes * _num_priors); - _num_priors_actual.resize(_num); - - const auto &confSize = op->get_input_shape(idx_confidence); - _reordered_conf.resize(std::accumulate(confSize.begin(), confSize.end(), 1, std::multiplies())); - - std::vector inDataConfigurators(op->get_input_size(), {TensorDescCreatorTypes::ncsp, Precision::FP32}); - addConfig(op, inDataConfigurators, - {{TensorDescCreatorTypes::ncsp, Precision::FP32}}); - } catch (InferenceEngine::Exception &ex) { - errorMsg = ex.what(); - throw; - } - } - - StatusCode execute(std::vector& inputs, std::vector& outputs, - ResponseDesc *resp) noexcept override { - float *dst_data = outputs[0]->buffer(); - - const float *loc_data = inputs[idx_location]->buffer().as(); - const float *conf_data = inputs[idx_confidence]->buffer().as(); - const float *prior_data = inputs[idx_priors]->buffer().as(); - const float *arm_conf_data = inputs.size() > 3 ? 
inputs[idx_arm_confidence]->buffer().as() : nullptr; - const float *arm_loc_data = inputs.size() > 4 ? inputs[idx_arm_location]->buffer().as() : nullptr; - - const int N = inputs[idx_confidence]->getTensorDesc().getDims()[0]; - - float *decoded_bboxes_data = _decoded_bboxes.data(); - float *reordered_conf_data = _reordered_conf.data(); - float *bbox_sizes_data = _bbox_sizes.data(); - int *detections_data = _detections_count.data(); - int *buffer_data = _buffer.data(); - int *indices_data = _indices.data(); - int *num_priors_actual = _num_priors_actual.data(); - - for (int n = 0; n < N; ++n) { - const float *ppriors = prior_data; - const float *prior_variances = prior_data + _num_priors*_prior_size; - if (_priors_batches) { - ppriors += _variance_encoded_in_target ? n*_num_priors*_prior_size : 2*n*_num_priors*_prior_size; - prior_variances += _variance_encoded_in_target ? 0 : 2*n*_num_priors*_prior_size; - } - - if (_share_location) { - const float *ploc = loc_data + n*4*_num_priors; - float *pboxes = decoded_bboxes_data + n*4*_num_priors; - float *psizes = bbox_sizes_data + n*_num_priors; - - if (with_add_box_pred) { - const float *p_arm_loc = arm_loc_data + n*4*_num_priors; - decodeBBoxes(ppriors, p_arm_loc, prior_variances, pboxes, psizes, num_priors_actual, n, _offset, _prior_size); - decodeBBoxes(pboxes, ploc, prior_variances, pboxes, psizes, num_priors_actual, n, 0, 4, false); - } else { - decodeBBoxes(ppriors, ploc, prior_variances, pboxes, psizes, num_priors_actual, n, _offset, _prior_size); - } - } else { - for (int c = 0; c < _num_loc_classes; ++c) { - if (c == _background_label_id) { - continue; - } - const float *ploc = loc_data + n*4*_num_loc_classes*_num_priors + c*4; - float *pboxes = decoded_bboxes_data + n*4*_num_loc_classes*_num_priors + c*4*_num_priors; - float *psizes = bbox_sizes_data + n*_num_loc_classes*_num_priors + c*_num_priors; - if (with_add_box_pred) { - const float *p_arm_loc = arm_loc_data + n*4*_num_loc_classes*_num_priors + c*4; - decodeBBoxes(ppriors, p_arm_loc, prior_variances, pboxes, psizes, num_priors_actual, n, _offset, _prior_size); - decodeBBoxes(pboxes, ploc, prior_variances, pboxes, psizes, num_priors_actual, n, 0, 4, false); - } else { - decodeBBoxes(ppriors, ploc, prior_variances, pboxes, psizes, num_priors_actual, n, _offset, _prior_size); - } - } - } - } - - if (with_add_box_pred) { - for (int n = 0; n < N; ++n) { - for (int p = 0; p < _num_priors; ++p) { - if (arm_conf_data[n*_num_priors*2 + p * 2 + 1] < _objectness_score) { - for (int c = 0; c < _num_classes; ++c) { - reordered_conf_data[n*_num_priors*_num_classes + c*_num_priors + p] = c == _background_label_id ? 
1.0f : 0.0f; - } - } else { - for (int c = 0; c < _num_classes; ++c) { - reordered_conf_data[n*_num_priors*_num_classes + c*_num_priors + p] = conf_data[n*_num_priors*_num_classes + p*_num_classes + c]; - } - } - } - } - } else { - for (int n = 0; n < N; ++n) { - for (int c = 0; c < _num_classes; ++c) { - for (int p = 0; p < _num_priors; ++p) { - reordered_conf_data[n*_num_priors*_num_classes + c*_num_priors + p] = conf_data[n*_num_priors*_num_classes + p*_num_classes + c]; - } - } - } - } - - memset(detections_data, 0, N*_num_classes*sizeof(int)); - - for (int n = 0; n < N; ++n) { - int detections_total = 0; - - if (!_decrease_label_id) { - // Caffe style - parallel_for(_num_classes, [&](int c) { - if (c != _background_label_id) { // Ignore background class - int *pindices = indices_data + n*_num_classes*_num_priors + c*_num_priors; - int *pbuffer = buffer_data + c*_num_priors; - int *pdetections = detections_data + n*_num_classes + c; - - const float *pconf = reordered_conf_data + n*_num_classes*_num_priors + c*_num_priors; - const float *pboxes; - const float *psizes; - if (_share_location) { - pboxes = decoded_bboxes_data + n*4*_num_priors; - psizes = bbox_sizes_data + n*_num_priors; - } else { - pboxes = decoded_bboxes_data + n*4*_num_classes*_num_priors + c*4*_num_priors; - psizes = bbox_sizes_data + n*_num_classes*_num_priors + c*_num_priors; - } - - nms_cf(pconf, pboxes, psizes, pbuffer, pindices, *pdetections, num_priors_actual[n]); - } - }); - } else { - // MXNet style - int *pindices = indices_data + n*_num_classes*_num_priors; - int *pbuffer = buffer_data; - int *pdetections = detections_data + n*_num_classes; - - const float *pconf = reordered_conf_data + n*_num_classes*_num_priors; - const float *pboxes = decoded_bboxes_data + n*4*_num_loc_classes*_num_priors; - const float *psizes = bbox_sizes_data + n*_num_loc_classes*_num_priors; - - nms_mx(pconf, pboxes, psizes, pbuffer, pindices, pdetections, _num_priors); - } - - for (int c = 0; c < _num_classes; ++c) { - detections_total += detections_data[n*_num_classes + c]; - } - - if (_keep_top_k > -1 && detections_total > _keep_top_k) { - std::vector>> conf_index_class_map; - - for (int c = 0; c < _num_classes; ++c) { - int detections = detections_data[n*_num_classes + c]; - int *pindices = indices_data + n*_num_classes*_num_priors + c*_num_priors; - - float *pconf = reordered_conf_data + n*_num_classes*_num_priors + c*_num_priors; - - for (int i = 0; i < detections; ++i) { - int idx = pindices[i]; - conf_index_class_map.push_back(std::make_pair(pconf[idx], std::make_pair(c, idx))); - } - } - - std::sort(conf_index_class_map.begin(), conf_index_class_map.end(), - SortScorePairDescend>); - conf_index_class_map.resize(_keep_top_k); - - // Store the new indices. 
- memset(detections_data + n*_num_classes, 0, _num_classes * sizeof(int)); - - for (size_t j = 0; j < conf_index_class_map.size(); ++j) { - int label = conf_index_class_map[j].second.first; - int idx = conf_index_class_map[j].second.second; - int *pindices = indices_data + n * _num_classes * _num_priors + label * _num_priors; - pindices[detections_data[n*_num_classes + label]] = idx; - detections_data[n*_num_classes + label]++; - } - } - } - - const int num_results = outputs[0]->getTensorDesc().getDims()[2]; - const int DETECTION_SIZE = outputs[0]->getTensorDesc().getDims()[3]; - if (DETECTION_SIZE != 7) { - return NOT_IMPLEMENTED; - } - - int dst_data_size = 0; - if (_keep_top_k > 0) - dst_data_size = N * _keep_top_k * DETECTION_SIZE * sizeof(float); - else if (_top_k > 0) - dst_data_size = N * _top_k * _num_classes * DETECTION_SIZE * sizeof(float); - else - dst_data_size = N * _num_classes * _num_priors * DETECTION_SIZE * sizeof(float); - - if (dst_data_size > outputs[0]->byteSize()) { - return OUT_OF_BOUNDS; - } - memset(dst_data, 0, dst_data_size); - - int count = 0; - for (int n = 0; n < N; ++n) { - const float *pconf = reordered_conf_data + n * _num_priors * _num_classes; - const float *pboxes = decoded_bboxes_data + n*_num_priors*4*_num_loc_classes; - const int *pindices = indices_data + n*_num_classes*_num_priors; - - for (int c = 0; c < _num_classes; ++c) { - for (int i = 0; i < detections_data[n*_num_classes + c]; ++i) { - int idx = pindices[c*_num_priors + i]; - - dst_data[count * DETECTION_SIZE + 0] = static_cast(n); - dst_data[count * DETECTION_SIZE + 1] = static_cast(_decrease_label_id ? c-1 : c); - dst_data[count * DETECTION_SIZE + 2] = pconf[c*_num_priors + idx]; - - float xmin = _share_location ? pboxes[idx*4 + 0] : - pboxes[c*4*_num_priors + idx*4 + 0]; - float ymin = _share_location ? pboxes[idx*4 + 1] : - pboxes[c*4*_num_priors + idx*4 + 1]; - float xmax = _share_location ? pboxes[idx*4 + 2] : - pboxes[c*4*_num_priors + idx*4 + 2]; - float ymax = _share_location ? 
pboxes[idx*4 + 3] : - pboxes[c*4*_num_priors + idx*4 + 3]; - - if (_clip_after_nms) { - xmin = (std::max)(0.0f, (std::min)(1.0f, xmin)); - ymin = (std::max)(0.0f, (std::min)(1.0f, ymin)); - xmax = (std::max)(0.0f, (std::min)(1.0f, xmax)); - ymax = (std::max)(0.0f, (std::min)(1.0f, ymax)); - } - - dst_data[count * DETECTION_SIZE + 3] = xmin; - dst_data[count * DETECTION_SIZE + 4] = ymin; - dst_data[count * DETECTION_SIZE + 5] = xmax; - dst_data[count * DETECTION_SIZE + 6] = ymax; - - ++count; - } - } - } - - if (count < num_results) { - // marker at end of boxes list - dst_data[count * DETECTION_SIZE + 0] = -1; - } - - return OK; - } - -private: - const int idx_location = 0; - const int idx_confidence = 1; - const int idx_priors = 2; - const int idx_arm_confidence = 3; - const int idx_arm_location = 4; - - int _num_classes = 0; - int _background_label_id = 0; - int _top_k = 0; - int _variance_encoded_in_target = 0; - int _keep_top_k = 0; - int _code_type = 0; - - bool _share_location = false; - bool _clip_before_nms = false; // clip bounding boxes before nms step - bool _clip_after_nms = false; // clip bounding boxes after nms step - bool _decrease_label_id = false; - - bool with_add_box_pred = false; - - int _image_width = 0; - int _image_height = 0; - int _prior_size = 4; - bool _normalized = true; - int _offset = 0; - - float _nms_threshold = 0.0f; - float _confidence_threshold = 0.0f; - float _objectness_score = 0.0f; - - int _num = 0; - int _num_loc_classes = 0; - int _num_priors = 0; - bool _priors_batches = false; - - enum CodeType { - CORNER = 1, - CENTER_SIZE = 2, - }; - - void decodeBBoxes(const float *prior_data, const float *loc_data, const float *variance_data, - float *decoded_bboxes, float *decoded_bbox_sizes, int* num_priors_actual, int n, const int& offs, const int& pr_size, - bool decodeType = true); // after ARM = false - - void nms_cf(const float *conf_data, const float *bboxes, const float *sizes, - int *buffer, int *indices, int &detections, int num_priors_actual); - - void nms_mx(const float *conf_data, const float *bboxes, const float *sizes, - int *buffer, int *indices, int *detections, int num_priors_actual); - - std::vector _decoded_bboxes; - std::vector _buffer; - std::vector _indices; - std::vector _detections_count; - std::vector _reordered_conf; - std::vector _bbox_sizes; - std::vector _num_priors_actual; -}; - -struct ConfidenceComparator { - explicit ConfidenceComparator(const float* conf_data) : _conf_data(conf_data) {} - - bool operator()(int idx1, int idx2) { - if (_conf_data[idx1] > _conf_data[idx2]) return true; - if (_conf_data[idx1] < _conf_data[idx2]) return false; - return idx1 < idx2; - } - - const float* _conf_data; -}; - -static inline float JaccardOverlap(const float *decoded_bbox, - const float *bbox_sizes, - const int idx1, - const int idx2) { - float xmin1 = decoded_bbox[idx1*4 + 0]; - float ymin1 = decoded_bbox[idx1*4 + 1]; - float xmax1 = decoded_bbox[idx1*4 + 2]; - float ymax1 = decoded_bbox[idx1*4 + 3]; - - float xmin2 = decoded_bbox[idx2*4 + 0]; - float ymin2 = decoded_bbox[idx2*4 + 1]; - float xmax2 = decoded_bbox[idx2*4 + 2]; - float ymax2 = decoded_bbox[idx2*4 + 3]; - - if (xmin2 > xmax1 || xmax2 < xmin1 || ymin2 > ymax1 || ymax2 < ymin1) { - return 0.0f; - } - - float intersect_xmin = (std::max)(xmin1, xmin2); - float intersect_ymin = (std::max)(ymin1, ymin2); - float intersect_xmax = (std::min)(xmax1, xmax2); - float intersect_ymax = (std::min)(ymax1, ymax2); - - float intersect_width = intersect_xmax - intersect_xmin; - float 
intersect_height = intersect_ymax - intersect_ymin; - - if (intersect_width <= 0 || intersect_height <= 0) { - return 0.0f; - } - - float intersect_size = intersect_width * intersect_height; - float bbox1_size = bbox_sizes[idx1]; - float bbox2_size = bbox_sizes[idx2]; - - return intersect_size / (bbox1_size + bbox2_size - intersect_size); -} - -void DetectionOutputImpl::decodeBBoxes(const float *prior_data, - const float *loc_data, - const float *variance_data, - float *decoded_bboxes, - float *decoded_bbox_sizes, - int* num_priors_actual, - int n, - const int& offs, - const int& pr_size, - bool decodeType) { - num_priors_actual[n] = _num_priors; - if (!_normalized && decodeType) { - int num = 0; - for (; num < _num_priors; ++num) { - float batch_id = prior_data[num * pr_size + 0]; - if (batch_id == -1.f) { - num_priors_actual[n] = num; - break; - } - } - } - parallel_for(num_priors_actual[n], [&](int p) { - float new_xmin = 0.0f; - float new_ymin = 0.0f; - float new_xmax = 0.0f; - float new_ymax = 0.0f; - - float prior_xmin = prior_data[p*pr_size + 0 + offs]; - float prior_ymin = prior_data[p*pr_size + 1 + offs]; - float prior_xmax = prior_data[p*pr_size + 2 + offs]; - float prior_ymax = prior_data[p*pr_size + 3 + offs]; - - float loc_xmin = loc_data[4*p*_num_loc_classes + 0]; - float loc_ymin = loc_data[4*p*_num_loc_classes + 1]; - float loc_xmax = loc_data[4*p*_num_loc_classes + 2]; - float loc_ymax = loc_data[4*p*_num_loc_classes + 3]; - - if (!_normalized) { - prior_xmin /= _image_width; - prior_ymin /= _image_height; - prior_xmax /= _image_width; - prior_ymax /= _image_height; - } - - if (_code_type == CodeType::CORNER) { - if (_variance_encoded_in_target) { - // variance is encoded in target, we simply need to add the offset predictions. - new_xmin = prior_xmin + loc_xmin; - new_ymin = prior_ymin + loc_ymin; - new_xmax = prior_xmax + loc_xmax; - new_ymax = prior_ymax + loc_ymax; - } else { - new_xmin = prior_xmin + variance_data[p*4 + 0] * loc_xmin; - new_ymin = prior_ymin + variance_data[p*4 + 1] * loc_ymin; - new_xmax = prior_xmax + variance_data[p*4 + 2] * loc_xmax; - new_ymax = prior_ymax + variance_data[p*4 + 3] * loc_ymax; - } - } else if (_code_type == CodeType::CENTER_SIZE) { - float prior_width = prior_xmax - prior_xmin; - float prior_height = prior_ymax - prior_ymin; - float prior_center_x = (prior_xmin + prior_xmax) / 2.0f; - float prior_center_y = (prior_ymin + prior_ymax) / 2.0f; - - float decode_bbox_center_x, decode_bbox_center_y; - float decode_bbox_width, decode_bbox_height; - - if (_variance_encoded_in_target) { - // variance is encoded in target, we simply need to restore the offset predictions. - decode_bbox_center_x = loc_xmin * prior_width + prior_center_x; - decode_bbox_center_y = loc_ymin * prior_height + prior_center_y; - decode_bbox_width = std::exp(loc_xmax) * prior_width; - decode_bbox_height = std::exp(loc_ymax) * prior_height; - } else { - // variance is encoded in bbox, we need to scale the offset accordingly. 
- decode_bbox_center_x = variance_data[p*4 + 0] * loc_xmin * prior_width + prior_center_x; - decode_bbox_center_y = variance_data[p*4 + 1] * loc_ymin * prior_height + prior_center_y; - decode_bbox_width = std::exp(variance_data[p*4 + 2] * loc_xmax) * prior_width; - decode_bbox_height = std::exp(variance_data[p*4 + 3] * loc_ymax) * prior_height; - } - - new_xmin = decode_bbox_center_x - decode_bbox_width / 2.0f; - new_ymin = decode_bbox_center_y - decode_bbox_height / 2.0f; - new_xmax = decode_bbox_center_x + decode_bbox_width / 2.0f; - new_ymax = decode_bbox_center_y + decode_bbox_height / 2.0f; - } - - if (_clip_before_nms) { - new_xmin = (std::max)(0.0f, (std::min)(1.0f, new_xmin)); - new_ymin = (std::max)(0.0f, (std::min)(1.0f, new_ymin)); - new_xmax = (std::max)(0.0f, (std::min)(1.0f, new_xmax)); - new_ymax = (std::max)(0.0f, (std::min)(1.0f, new_ymax)); - } - - decoded_bboxes[p*4 + 0] = new_xmin; - decoded_bboxes[p*4 + 1] = new_ymin; - decoded_bboxes[p*4 + 2] = new_xmax; - decoded_bboxes[p*4 + 3] = new_ymax; - - decoded_bbox_sizes[p] = (new_xmax - new_xmin) * (new_ymax - new_ymin); - }); -} - -void DetectionOutputImpl::nms_cf(const float* conf_data, - const float* bboxes, - const float* sizes, - int* buffer, - int* indices, - int& detections, - int num_priors_actual) { - int count = 0; - for (int i = 0; i < num_priors_actual; ++i) { - if (conf_data[i] > _confidence_threshold) { - indices[count] = i; - count++; - } - } - - int num_output_scores = (_top_k == -1 ? count : (std::min)(_top_k, count)); - - std::partial_sort_copy(indices, indices + count, - buffer, buffer + num_output_scores, - ConfidenceComparator(conf_data)); - - for (int i = 0; i < num_output_scores; ++i) { - const int idx = buffer[i]; - - bool keep = true; - for (int k = 0; k < detections; ++k) { - const int kept_idx = indices[k]; - float overlap = JaccardOverlap(bboxes, sizes, idx, kept_idx); - if (overlap > _nms_threshold) { - keep = false; - break; - } - } - if (keep) { - indices[detections] = idx; - detections++; - } - } -} - -void DetectionOutputImpl::nms_mx(const float* conf_data, - const float* bboxes, - const float* sizes, - int* buffer, - int* indices, - int* detections, - int num_priors_actual) { - int count = 0; - for (int i = 0; i < num_priors_actual; ++i) { - float conf = -1; - int id = 0; - for (int c = 1; c < _num_classes; ++c) { - float temp = conf_data[c*_num_priors + i]; - if (temp > conf) { - conf = temp; - id = c; - } - } - - if (id > 0 && conf >= _confidence_threshold) { - indices[count++] = id*_num_priors + i; - } - } - - int num_output_scores = (_top_k == -1 ? 
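decodeBBoxes() above implements the two Caffe prior-box code types; the CENTER_SIZE branch treats the prediction as a centre offset plus log-scaled width and height relative to the prior, optionally scaled by per-prior variances. A standalone sketch of that branch for a single box follows; names are hypothetical, the CORNER branch and clipping are omitted, and passing variances of {1, 1, 1, 1} reproduces the variance-encoded-in-target case.

#include <cmath>

struct Box { float xmin, ymin, xmax, ymax; };

// loc = {dx, dy, dlog_w, dlog_h}, var = per-prior variances.
Box decodeCenterSize(const Box& prior, const float loc[4], const float var[4]) {
    const float pw  = prior.xmax - prior.xmin;
    const float ph  = prior.ymax - prior.ymin;
    const float pcx = 0.5f * (prior.xmin + prior.xmax);
    const float pcy = 0.5f * (prior.ymin + prior.ymax);

    const float cx = var[0] * loc[0] * pw + pcx;
    const float cy = var[1] * loc[1] * ph + pcy;
    const float w  = std::exp(var[2] * loc[2]) * pw;
    const float h  = std::exp(var[3] * loc[3]) * ph;

    return {cx - 0.5f * w, cy - 0.5f * h, cx + 0.5f * w, cy + 0.5f * h};
}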
count : (std::min)(_top_k, count)); - - std::partial_sort_copy(indices, indices + count, - buffer, buffer + num_output_scores, - ConfidenceComparator(conf_data)); - - for (int i = 0; i < num_output_scores; ++i) { - const int idx = buffer[i]; - const int cls = idx/_num_priors; - const int prior = idx%_num_priors; - - int &ndetection = detections[cls]; - int *pindices = indices + cls*_num_priors; - - bool keep = true; - for (int k = 0; k < ndetection; ++k) { - const int kept_idx = pindices[k]; - float overlap = 0.0f; - if (_share_location) { - overlap = JaccardOverlap(bboxes, sizes, prior, kept_idx); - } else { - overlap = JaccardOverlap(bboxes, sizes, cls*_num_priors + prior, cls*_num_priors + kept_idx); - } - if (overlap > _nms_threshold) { - keep = false; - break; - } - } - if (keep) { - pindices[ndetection++] = prior; - } - } -} - -REG_FACTORY_FOR(DetectionOutputImpl, DetectionOutput); - -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine diff --git a/inference-engine/src/mkldnn_plugin/nodes/detectionoutput_onnx.cpp b/inference-engine/src/mkldnn_plugin/nodes/detectionoutput_onnx.cpp deleted file mode 100644 index fefcee872cea4f..00000000000000 --- a/inference-engine/src/mkldnn_plugin/nodes/detectionoutput_onnx.cpp +++ /dev/null @@ -1,402 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "base.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include "ie_parallel.hpp" -#include "common/tensor_desc_creator.h" -#include - - -namespace { -struct Indexer { - const std::vector dims_; - int total_{1}; - - explicit Indexer(const std::vector& dims) : dims_(dims) { - total_ = 1; - for (size_t i = 0; i < dims_.size(); ++i) { - total_ *= dims_[i]; - } - } - - int operator()(const std::vector& idx) const { - int flat_idx = 0; - assert(idx.size() == dims_.size()); - for (size_t i = 0; i < dims_.size(); ++i) { - assert(0 <= idx[i] && idx[i] < dims_[i]); - flat_idx = flat_idx * dims_[i] + idx[i]; - } - assert(flat_idx < total_); - return flat_idx; - } -}; -} // namespace - - -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { - -using MKLDNNPlugin::TensorDescCreatorTypes; - -static -void refine_boxes(const float* boxes, const float* deltas, const float* weights, const float* scores, - float* refined_boxes, float* refined_boxes_areas, float* refined_scores, - const int rois_num, const int classes_num, - const float img_H, const float img_W, - const float max_delta_log_wh, - float coordinates_offset) { - Indexer box_idx({rois_num, 4}); - Indexer delta_idx({rois_num, classes_num, 4}); - Indexer score_idx({rois_num, classes_num}); - - Indexer refined_box_idx({classes_num, rois_num, 4}); - Indexer refined_score_idx({classes_num, rois_num}); - - for (int roi_idx = 0; roi_idx < rois_num; ++roi_idx) { - float x0 = boxes[box_idx({roi_idx, 0})]; - float y0 = boxes[box_idx({roi_idx, 1})]; - float x1 = boxes[box_idx({roi_idx, 2})]; - float y1 = boxes[box_idx({roi_idx, 3})]; - - if (x1 - x0 <= 0 || y1 - y0 <= 0) { - continue; - } - - // width & height of box - const float ww = x1 - x0 + coordinates_offset; - const float hh = y1 - y0 + coordinates_offset; - // center location of box - const float ctr_x = x0 + 0.5f * ww; - const float ctr_y = y0 + 0.5f * hh; - - for (int class_idx = 1; class_idx < classes_num; ++class_idx) { - const float dx = deltas[delta_idx({roi_idx, class_idx, 0})] / weights[0]; - const float dy = deltas[delta_idx({roi_idx, class_idx, 1})] / weights[1]; - const 
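Both nms_cf() (Caffe style, per class) and nms_mx() (MXNet style, across classes) reduce to the same greedy scheme: keep candidates above the confidence threshold, sort them by score, and accept a box only if its Jaccard overlap with every previously accepted box stays below the NMS threshold. A self-contained sketch of that scheme; greedyNms() is a hypothetical name, and the deleted code additionally caches box areas and works on preallocated index buffers instead of allocating vectors.

#include <algorithm>
#include <cstddef>
#include <vector>

struct Det { float score; float box[4]; };   // box = {xmin, ymin, xmax, ymax}

static float iou(const float* a, const float* b) {
    const float ix0 = std::max(a[0], b[0]), iy0 = std::max(a[1], b[1]);
    const float ix1 = std::min(a[2], b[2]), iy1 = std::min(a[3], b[3]);
    const float iw = ix1 - ix0, ih = iy1 - iy0;
    if (iw <= 0.f || ih <= 0.f) return 0.f;
    const float inter = iw * ih;
    const float areaA = (a[2] - a[0]) * (a[3] - a[1]);
    const float areaB = (b[2] - b[0]) * (b[3] - b[1]);
    return inter / (areaA + areaB - inter);
}

// Greedy NMS: returns indices of kept detections. topK < 0 means "keep all".
std::vector<size_t> greedyNms(const std::vector<Det>& dets, float confThresh,
                              int topK, float nmsThresh) {
    std::vector<size_t> order;
    for (size_t i = 0; i < dets.size(); ++i)
        if (dets[i].score > confThresh) order.push_back(i);
    std::sort(order.begin(), order.end(),
              [&](size_t a, size_t b) { return dets[a].score > dets[b].score; });
    if (topK >= 0 && order.size() > static_cast<size_t>(topK)) order.resize(topK);

    std::vector<size_t> kept;
    for (size_t idx : order) {
        bool keep = true;
        for (size_t k : kept)
            if (iou(dets[idx].box, dets[k].box) > nmsThresh) { keep = false; break; }
        if (keep) kept.push_back(idx);
    }
    return kept;
}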
float d_log_w = deltas[delta_idx({roi_idx, class_idx, 2})] / weights[2]; - const float d_log_h = deltas[delta_idx({roi_idx, class_idx, 3})] / weights[3]; - - // new center location according to deltas (dx, dy) - const float pred_ctr_x = dx * ww + ctr_x; - const float pred_ctr_y = dy * hh + ctr_y; - // new width & height according to deltas d(log w), d(log h) - const float pred_w = std::exp((std::min)(d_log_w, max_delta_log_wh)) * ww; - const float pred_h = std::exp((std::min)(d_log_h, max_delta_log_wh)) * hh; - - // update upper-left corner location - float x0_new = pred_ctr_x - 0.5f * pred_w; - float y0_new = pred_ctr_y - 0.5f * pred_h; - // update lower-right corner location - float x1_new = pred_ctr_x + 0.5f * pred_w - coordinates_offset; - float y1_new = pred_ctr_y + 0.5f * pred_h - coordinates_offset; - - // adjust new corner locations to be within the image region, - x0_new = std::max(0.0f, x0_new); - y0_new = std::max(0.0f, y0_new); - x1_new = std::max(0.0f, x1_new); - y1_new = std::max(0.0f, y1_new); - - // recompute new width & height - const float box_w = x1_new - x0_new + coordinates_offset; - const float box_h = y1_new - y0_new + coordinates_offset; - - refined_boxes[refined_box_idx({class_idx, roi_idx, 0})] = x0_new; - refined_boxes[refined_box_idx({class_idx, roi_idx, 1})] = y0_new; - refined_boxes[refined_box_idx({class_idx, roi_idx, 2})] = x1_new; - refined_boxes[refined_box_idx({class_idx, roi_idx, 3})] = y1_new; - - refined_boxes_areas[refined_score_idx({class_idx, roi_idx})] = box_w * box_h; - - refined_scores[refined_score_idx({class_idx, roi_idx})] = scores[score_idx({roi_idx, class_idx})]; - } - } -} - -template -static bool SortScorePairDescend(const std::pair& pair1, - const std::pair& pair2) { - return pair1.first > pair2.first; -} - - -struct ConfidenceComparator { - explicit ConfidenceComparator(const float* conf_data) : _conf_data(conf_data) {} - - bool operator()(int idx1, int idx2) { - if (_conf_data[idx1] > _conf_data[idx2]) return true; - if (_conf_data[idx1] < _conf_data[idx2]) return false; - return idx1 < idx2; - } - - const float* _conf_data; -}; - -static inline float JaccardOverlap(const float *decoded_bbox, - const float *bbox_sizes, - const int idx1, - const int idx2, - const float coordinates_offset = 1) { - float xmin1 = decoded_bbox[idx1 * 4 + 0]; - float ymin1 = decoded_bbox[idx1 * 4 + 1]; - float xmax1 = decoded_bbox[idx1 * 4 + 2]; - float ymax1 = decoded_bbox[idx1 * 4 + 3]; - - float xmin2 = decoded_bbox[idx2 * 4 + 0]; - float ymin2 = decoded_bbox[idx2 * 4 + 1]; - float ymax2 = decoded_bbox[idx2 * 4 + 3]; - float xmax2 = decoded_bbox[idx2 * 4 + 2]; - - if (xmin2 > xmax1 || xmax2 < xmin1 || ymin2 > ymax1 || ymax2 < ymin1) { - return 0.0f; - } - - float intersect_xmin = (std::max)(xmin1, xmin2); - float intersect_ymin = (std::max)(ymin1, ymin2); - float intersect_xmax = (std::min)(xmax1, xmax2); - float intersect_ymax = (std::min)(ymax1, ymax2); - - float intersect_width = intersect_xmax - intersect_xmin + coordinates_offset; - float intersect_height = intersect_ymax - intersect_ymin + coordinates_offset; - - if (intersect_width <= 0 || intersect_height <= 0) { - return 0.0f; - } - - float intersect_size = intersect_width * intersect_height; - float bbox1_size = bbox_sizes[idx1]; - float bbox2_size = bbox_sizes[idx2]; - - return intersect_size / (bbox1_size + bbox2_size - intersect_size); -} - - -static void nms_cf(const float* conf_data, - const float* bboxes, - const float* sizes, - int* buffer, - int* indices, - int& detections, - const int 
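refine_boxes() applies the familiar R-CNN box regression: deltas are divided by the per-coordinate weights, the log-size terms are clamped by max_delta_log_wh before exponentiation, and the resulting corners are clamped at zero, as in the excerpt above. The same arithmetic for a single ROI/class pair; refineBox() is a hypothetical name and coordOffset is 1.0f for the coordinate convention the kernel passes in.

#include <algorithm>
#include <cmath>

void refineBox(const float box[4], const float delta[4], const float weights[4],
               float maxDeltaLogWH, float coordOffset, float refined[4]) {
    const float w  = box[2] - box[0] + coordOffset;
    const float h  = box[3] - box[1] + coordOffset;
    const float cx = box[0] + 0.5f * w;
    const float cy = box[1] + 0.5f * h;

    const float dx = delta[0] / weights[0];
    const float dy = delta[1] / weights[1];
    const float dw = std::min(delta[2] / weights[2], maxDeltaLogWH);
    const float dh = std::min(delta[3] / weights[3], maxDeltaLogWH);

    const float pcx = dx * w + cx, pcy = dy * h + cy;
    const float pw  = std::exp(dw) * w, ph = std::exp(dh) * h;

    refined[0] = std::max(0.f, pcx - 0.5f * pw);
    refined[1] = std::max(0.f, pcy - 0.5f * ph);
    refined[2] = std::max(0.f, pcx + 0.5f * pw - coordOffset);
    refined[3] = std::max(0.f, pcy + 0.5f * ph - coordOffset);
}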
boxes_num, - const int pre_nms_topn, - const int post_nms_topn, - const float confidence_threshold, - const float nms_threshold) { - int count = 0; - for (int i = 0; i < boxes_num; ++i) { - if (conf_data[i] > confidence_threshold) { - indices[count] = i; - count++; - } - } - - int num_output_scores = (pre_nms_topn == -1 ? count : (std::min)(pre_nms_topn, count)); - - std::partial_sort_copy(indices, indices + count, - buffer, buffer + num_output_scores, - ConfidenceComparator(conf_data)); - - detections = 0; - for (int i = 0; i < num_output_scores; ++i) { - const int idx = buffer[i]; - - bool keep = true; - for (int k = 0; k < detections; ++k) { - const int kept_idx = indices[k]; - float overlap = JaccardOverlap(bboxes, sizes, idx, kept_idx); - if (overlap > nms_threshold) { - keep = false; - break; - } - } - if (keep) { - indices[detections] = idx; - detections++; - } - } - - detections = (post_nms_topn == -1 ? detections : (std::min)(post_nms_topn, detections)); -} - - -class ExperimentalDetectronDetectionOutputImpl: public ExtLayerBase { -private: - const int INPUT_ROIS {0}; - const int INPUT_DELTAS {1}; - const int INPUT_SCORES {2}; - const int INPUT_IM_INFO {3}; - - const int OUTPUT_BOXES {0}; - const int OUTPUT_CLASSES {1}; - const int OUTPUT_SCORES {2}; - -public: - bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { - try { - auto doOp = ngraph::as_type_ptr(op); - if (!doOp) { - errorMessage = "Node is not an instance of the ExperimentalDetectronDetectionOutput from the operations set v6."; - return false; - } - } catch (...) { - return false; - } - return true; - } - - explicit ExperimentalDetectronDetectionOutputImpl(const std::shared_ptr& op) { - try { - std::string errorMessage; - if (!isSupportedOperation(op, errorMessage)) { - IE_THROW(NotImplemented) << errorMessage; - } - auto doOp = ngraph::as_type_ptr(op); - auto attributes = doOp->get_attrs(); - - score_threshold_ = attributes.score_threshold; - nms_threshold_ = attributes.nms_threshold; - max_delta_log_wh_ = attributes.max_delta_log_wh; - classes_num_ = attributes.num_classes; - max_detections_per_class_ = attributes.post_nms_count; - max_detections_per_image_ = attributes.max_detections_per_image; - class_agnostic_box_regression_ = attributes.class_agnostic_box_regression; - deltas_weights_ = attributes.deltas_weights; - - std::vector inDataConfigurators(op->get_input_size(), {TensorDescCreatorTypes::ncsp, Precision::FP32}); - - addConfig(op, inDataConfigurators, - {{TensorDescCreatorTypes::ncsp, Precision::FP32}, - {TensorDescCreatorTypes::ncsp, Precision::I32}, - {TensorDescCreatorTypes::ncsp, Precision::FP32}}); - } catch (InferenceEngine::Exception &ex) { - errorMsg = ex.what(); - throw; - } - } - - StatusCode execute(std::vector& inputs, std::vector& outputs, - ResponseDesc *resp) noexcept override { - const int rois_num = inputs[INPUT_ROIS]->getTensorDesc().getDims()[0]; - assert(classes_num_ == static_cast(inputs[INPUT_SCORES]->getTensorDesc().getDims()[1])); - assert(4 * classes_num_ == static_cast(inputs[INPUT_DELTAS]->getTensorDesc().getDims()[1])); - - const auto* boxes = inputs[INPUT_ROIS]->buffer().as(); - const auto* deltas = inputs[INPUT_DELTAS]->buffer().as(); - const auto* scores = inputs[INPUT_SCORES]->buffer().as(); - const auto* im_info = inputs[INPUT_IM_INFO]->buffer().as(); - - auto* output_boxes = outputs[OUTPUT_BOXES]->buffer().as(); - auto* output_scores = outputs[OUTPUT_SCORES]->buffer().as(); - auto* output_classes = 
outputs[OUTPUT_CLASSES]->buffer().as(); - - const float img_H = im_info[0]; - const float img_W = im_info[1]; - - // Apply deltas. - std::vector refined_boxes(classes_num_ * rois_num * 4, 0); - std::vector refined_scores(classes_num_ * rois_num, 0); - std::vector refined_boxes_areas(classes_num_ * rois_num, 0); - Indexer refined_box_idx({classes_num_, rois_num, 4}); - Indexer refined_score_idx({classes_num_, rois_num}); - - refine_boxes(boxes, deltas, &deltas_weights_[0], scores, - &refined_boxes[0], &refined_boxes_areas[0], &refined_scores[0], - rois_num, classes_num_, - img_H, img_W, - max_delta_log_wh_, - 1.0f); - - // Apply NMS class-wise. - std::vector buffer(rois_num, 0); - std::vector indices(classes_num_ * rois_num, 0); - std::vector detections_per_class(classes_num_, 0); - int total_detections_num = 0; - - for (int class_idx = 1; class_idx < classes_num_; ++class_idx) { - nms_cf(&refined_scores[refined_score_idx({class_idx, 0})], - &refined_boxes[refined_box_idx({class_idx, 0, 0})], - &refined_boxes_areas[refined_score_idx({class_idx, 0})], - &buffer[0], - &indices[total_detections_num], - detections_per_class[class_idx], - rois_num, - -1, - max_detections_per_class_, - score_threshold_, - nms_threshold_); - total_detections_num += detections_per_class[class_idx]; - } - - // Leave only max_detections_per_image_ detections. - // confidence, - std::vector>> conf_index_class_map; - - int indices_offset = 0; - for (int c = 0; c < classes_num_; ++c) { - int n = detections_per_class[c]; - for (int i = 0; i < n; ++i) { - int idx = indices[indices_offset + i]; - float score = refined_scores[refined_score_idx({c, idx})]; - conf_index_class_map.push_back(std::make_pair(score, std::make_pair(c, idx))); - } - indices_offset += n; - } - - assert(max_detections_per_image_ > 0); - if (total_detections_num > max_detections_per_image_) { - std::partial_sort(conf_index_class_map.begin(), - conf_index_class_map.begin() + max_detections_per_image_, - conf_index_class_map.end(), - SortScorePairDescend>); - conf_index_class_map.resize(max_detections_per_image_); - total_detections_num = max_detections_per_image_; - } - - // Fill outputs. 
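After class-wise NMS the kernel keeps at most max_detections_per_image results across all classes by partially sorting (score, (class, roi)) pairs in descending score order, as in the excerpt above. A trimmed-down version of that step; keepTopDetections() is a hypothetical name and, like the deleted code's assert, it assumes maxPerImage > 0.

#include <algorithm>
#include <utility>
#include <vector>

using Entry = std::pair<float, std::pair<int, int>>;   // score, (class, roi index)

void keepTopDetections(std::vector<Entry>& detections, int maxPerImage) {
    if (detections.size() <= static_cast<size_t>(maxPerImage)) return;
    std::partial_sort(detections.begin(), detections.begin() + maxPerImage, detections.end(),
                      [](const Entry& a, const Entry& b) { return a.first > b.first; });
    detections.resize(maxPerImage);
}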
- memset(output_boxes, 0, max_detections_per_image_ * 4 * sizeof(output_boxes[0])); - memset(output_scores, 0, max_detections_per_image_ * sizeof(output_scores[0])); - memset(output_classes, 0, max_detections_per_image_ * sizeof(output_classes[0])); - - int i = 0; - for (const auto & detection : conf_index_class_map) { - float score = detection.first; - int cls = detection.second.first; - int idx = detection.second.second; - output_boxes[4 * i + 0] = refined_boxes[refined_box_idx({cls, idx, 0})]; - output_boxes[4 * i + 1] = refined_boxes[refined_box_idx({cls, idx, 1})]; - output_boxes[4 * i + 2] = refined_boxes[refined_box_idx({cls, idx, 2})]; - output_boxes[4 * i + 3] = refined_boxes[refined_box_idx({cls, idx, 3})]; - output_scores[i] = score; - output_classes[i] = cls; - ++i; - } - - return OK; - } - -private: - float score_threshold_; - float nms_threshold_; - float max_delta_log_wh_; - int classes_num_; - int max_detections_per_class_; - int max_detections_per_image_; - bool class_agnostic_box_regression_; - std::vector deltas_weights_; -}; - - - -REG_FACTORY_FOR(ExperimentalDetectronDetectionOutputImpl, ExperimentalDetectronDetectionOutput); - -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine diff --git a/inference-engine/src/mkldnn_plugin/nodes/gather_tree.cpp b/inference-engine/src/mkldnn_plugin/nodes/gather_tree.cpp deleted file mode 100644 index 4ea74721adca49..00000000000000 --- a/inference-engine/src/mkldnn_plugin/nodes/gather_tree.cpp +++ /dev/null @@ -1,184 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "base.hpp" -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include "ie_parallel.hpp" - -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { - -using MKLDNNPlugin::TensorDescCreatorTypes; - -class GatherTreeImpl: public ExtLayerBase { -public: - static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { - try { - auto gatherElementsOp = ngraph::as_type_ptr(op); - if (!gatherElementsOp) { - errorMessage = "Node is not an instance of the GatherTree operation from operation set v1."; - return false; - } - - auto precision = op->get_input_element_type(GATHER_TREE_STEP_IDX); - if (!MKLDNNPlugin::one_of(precision, ngraph::element::f32, ngraph::element::i32)) - precision = ngraph::element::f32; - if (op->get_input_element_type(GATHER_TREE_PARENT_IDX) != precision || - op->get_input_element_type(GATHER_TREE_MAX_SEQ_LEN) != precision || - op->get_input_element_type(GATHER_TREE_END_TOKEN) != precision || - op->get_output_element_type(0) != precision) { - errorMessage = "Node has incorrect input/output data precision. Must be the same."; - return false; - } - } catch (...) 
{ - return false; - } - - return true; - } - - explicit GatherTreeImpl(const std::shared_ptr& op) { - try { - std::string errorMessage; - if (!isSupportedOperation(op, errorMessage)) { - IE_THROW(NotImplemented) << errorMessage; - } - - std::string errorPrefix = std::string("Node GatherTree with name '") + op->get_friendly_name() + "'"; - if (op->get_input_size() != 4) - IE_THROW() << errorPrefix << " has incorrect number of input edges."; - if (op->get_output_size() != 1) - IE_THROW() << errorPrefix << " has incorrect number of output edges."; - - precision = details::convertPrecision(op->get_input_element_type(GATHER_TREE_STEP_IDX)); - if (!MKLDNNPlugin::one_of(precision, Precision::FP32, Precision::I32)) - precision = Precision::FP32; - - if (op->get_input_shape(GATHER_TREE_STEP_IDX).size() != 3) - IE_THROW() << errorPrefix << " step_idx vector should be 3 dimension"; - if (op->get_input_shape(GATHER_TREE_PARENT_IDX).size() != 3) - IE_THROW() << errorPrefix << " parent_idx vector should be 3 dimension"; - if (op->get_input_shape(GATHER_TREE_MAX_SEQ_LEN).size() != 1) - IE_THROW() << errorPrefix << " max_seq_len vector should be 1 dimension"; - if (op->get_input_shape(GATHER_TREE_END_TOKEN).size() != 0) - IE_THROW() << errorPrefix << " end_token should be 1 dimension"; - - addConfig(op, {{TensorDescCreatorTypes::ncsp, precision}, - {TensorDescCreatorTypes::ncsp, precision}, - {TensorDescCreatorTypes::ncsp, precision}, - {TensorDescCreatorTypes::ncsp, precision}}, - {{TensorDescCreatorTypes::ncsp, precision}}); - } catch (InferenceEngine::Exception &ex) { - errorMsg = ex.what(); - throw; - } - } - - StatusCode execute(std::vector& inputs, std::vector& outputs, ResponseDesc *resp) noexcept override { - if (precision == Precision::FP32) - return execute_impl(inputs, outputs, resp); - else - return execute_impl(inputs, outputs, resp); - } - - template - StatusCode execute_impl(std::vector& inputs, std::vector& outputs, ResponseDesc *resp) noexcept { - const auto *step_idx = inputs[GATHER_TREE_STEP_IDX]->cbuffer().as() + - inputs[GATHER_TREE_STEP_IDX]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - const auto * const parent_idx = inputs[GATHER_TREE_PARENT_IDX]->cbuffer().as() + - inputs[GATHER_TREE_PARENT_IDX]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - const size_t parent_idx_size = inputs[GATHER_TREE_PARENT_IDX]->size() - - inputs[GATHER_TREE_PARENT_IDX]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - const auto *max_seq_len = inputs[GATHER_TREE_MAX_SEQ_LEN]->cbuffer().as() + - inputs[GATHER_TREE_MAX_SEQ_LEN]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - auto end_token = (inputs[GATHER_TREE_END_TOKEN]->cbuffer().as() + - inputs[GATHER_TREE_END_TOKEN]->getTensorDesc().getBlockingDesc().getOffsetPadding())[0]; - auto * final_idx = outputs[0]->cbuffer().as() + - outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - - SizeVector step_idx_dims = inputs[GATHER_TREE_STEP_IDX]->getTensorDesc().getDims(); - SizeVector parent_idx_dims = inputs[GATHER_TREE_PARENT_IDX]->getTensorDesc().getDims(); - SizeVector max_seq_len_dims = inputs[GATHER_TREE_MAX_SEQ_LEN]->getTensorDesc().getDims(); - SizeVector final_idx_dims = outputs[0]->getTensorDesc().getDims(); - int32_t max_time = step_idx_dims[0]; - const size_t batch_size = step_idx_dims[1]; - const size_t beam_width = step_idx_dims[2]; - const size_t bb_size = batch_size * beam_width; - - if (max_time != static_cast(parent_idx_dims[0]) || max_time != static_cast(final_idx_dims[0]) || - batch_size != 
parent_idx_dims[1] || batch_size != final_idx_dims[1] || batch_size != max_seq_len_dims[0] || - beam_width != parent_idx_dims[2] || beam_width != final_idx_dims[2]) { - if (resp) { - std::string errorMsg = "Input/Output tensors dimensions mismatch"; - errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); - } - return PARAMETER_MISMATCH; - } - - bool incorrect_result = false; - parallel_for2d(batch_size, beam_width, [&](size_t batch, size_t beam) { - int32_t max_sequence_in_beam = std::min(max_time, static_cast(max_seq_len[batch])); - if (max_sequence_in_beam > 0) { - int32_t time, idx = (max_time - 1) * bb_size + batch * beam_width; - for (time = (max_time - 1); time >= max_sequence_in_beam; time--, idx -= bb_size) - final_idx[idx + beam] = end_token; - - for (int32_t parent = static_cast(beam); time >= 0; time--, idx -= bb_size) { - if (parent < 0 - || parent >= static_cast(beam_width) - || idx + parent >= parent_idx_size) { - incorrect_result = true; - break; - } - final_idx[idx + beam] = step_idx[idx + parent]; - parent = static_cast(parent_idx[idx + parent]); - } - - bool finished = false; - auto *final = &final_idx[batch * beam_width + beam]; - for (time = 0; time < max_sequence_in_beam; time++, final += bb_size) { - if (finished) - (*final) = end_token; - else if ((*final) == end_token) - finished = true; - } - } - }); - - if (incorrect_result) { - if (resp) { - std::string errorMsg = "Wrong parent index, result is incorrect"; - errorMsg.copy(resp->msg, sizeof(resp->msg) - 1); - } - return OUT_OF_BOUNDS; - } - - return OK; - } - -private: - static const size_t GATHER_TREE_STEP_IDX = 0; - static const size_t GATHER_TREE_PARENT_IDX = 1; - static const size_t GATHER_TREE_MAX_SEQ_LEN = 2; - static const size_t GATHER_TREE_END_TOKEN = 3; - - InferenceEngine::Precision precision; -}; - -REG_FACTORY_FOR(GatherTreeImpl, GatherTree); - -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine diff --git a/inference-engine/src/mkldnn_plugin/nodes/grn.cpp b/inference-engine/src/mkldnn_plugin/nodes/grn.cpp deleted file mode 100644 index 6ee077fd52ff1e..00000000000000 --- a/inference-engine/src/mkldnn_plugin/nodes/grn.cpp +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "base.hpp" - -#include -#include -#include -#include "ie_parallel.hpp" -#include - -using namespace MKLDNNPlugin; - -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { - -class GRNImpl: public ExtLayerBase { - bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { - try { - const auto grn = std::dynamic_pointer_cast(op); - if (!grn) { - errorMessage = "Only opset1 GRN operation is supported"; - return false; - } - } catch (...) 
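For reference: the gather_tree.cpp kernel removed above backtracks beam-search results through parent pointers, pads positions past the actual sequence length with end_token, and forces everything after the first emitted end_token to end_token. A minimal standalone sketch of that backtracking for one (batch, beam) pair, assuming a row-major [max_time, batch, beam] layout; the names and buffers are illustrative, not the plugin's API, and the parent-index bounds checks are omitted:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Minimal gather_tree-style backtracking over a single (batch, beam) pair.
    // step_idx / parent_idx are flattened [max_time, batch, beam] buffers.
    void gather_one_beam(const std::vector<int32_t>& step_idx,
                         const std::vector<int32_t>& parent_idx,
                         std::vector<int32_t>& final_idx,
                         int32_t max_time, int32_t batch_size, int32_t beam_width,
                         int32_t batch, int32_t beam,
                         int32_t max_seq_len, int32_t end_token) {
        const int32_t bb = batch_size * beam_width;        // stride between time steps
        const int32_t max_seq = std::min(max_time, max_seq_len);
        if (max_seq <= 0)
            return;

        int32_t idx = (max_time - 1) * bb + batch * beam_width;
        int32_t time = max_time - 1;
        // Positions beyond the actual sequence length are filled with end_token.
        for (; time >= max_seq; --time, idx -= bb)
            final_idx[idx + beam] = end_token;

        // Walk backwards through parent pointers, copying the selected token ids.
        for (int32_t parent = beam; time >= 0; --time, idx -= bb) {
            final_idx[idx + beam] = step_idx[idx + parent];
            parent = parent_idx[idx + parent];
        }

        // Once end_token is emitted, everything after it is forced to end_token.
        bool finished = false;
        for (time = 0; time < max_seq; ++time) {
            int32_t& v = final_idx[time * bb + batch * beam_width + beam];
            if (finished)
                v = end_token;
            else if (v == end_token)
                finished = true;
        }
    }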
{ - return false; - } - return true; - } - - std::string errorPrefix; - -public: - explicit GRNImpl(const std::shared_ptr& op) { - try { - std::string errorMessage; - if (!isSupportedOperation(op, errorMessage)) { - IE_THROW(NotImplemented) << errorMessage; - } - - errorPrefix = "GRN layer with name '" + op->get_friendly_name() + "'"; - const auto grn = std::dynamic_pointer_cast(op); - - if (op->get_input_size() != 1 || op->get_output_size() != 1) - IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; - - bias = grn->get_bias(); - - addConfig(op, {{TensorDescCreatorTypes::ncsp, Precision::FP32, false, 0}}, - {{TensorDescCreatorTypes::ncsp, Precision::FP32, false, 0}}); - } catch (InferenceEngine::Exception &ex) { - errorMsg = ex.what(); - } - } - - StatusCode execute(std::vector& inputs, std::vector& outputs, - ResponseDesc *resp) noexcept override { - float* src_data = inputs[0]->buffer(); - float* dst_data = outputs[0]->buffer(); - - SizeVector dims = inputs[0]->getTensorDesc().getDims(); - - int N = static_cast((dims.size() > 0) ? dims[0] : 1); - int C = static_cast((dims.size() > 1) ? dims[1] : 1); - int H = static_cast((dims.size() > 2) ? dims[2] : 1); - int W = static_cast((dims.size() > 3) ? dims[3] : 1); - - parallel_for3d(N, H, W, [&](int b, int h, int w) { - double variance = 0; - for (int c = 0; c < C; c++) { - variance += std::pow(src_data[b*C*H*W + c*H*W + h*W + w], 2); - } - variance = std::pow(variance + bias, 0.5f); - for (int c = 0; c < C; c++) { - dst_data[b*C*H*W + c*H*W + h*W + w] = src_data[b*C*H*W + c*H*W + h*W + w] / static_cast(variance); - } - }); - return OK; - } - -private: - float bias = 1.0f; -}; - -REG_FACTORY_FOR(GRNImpl, GRN); - -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine diff --git a/inference-engine/src/mkldnn_plugin/nodes/list_tbl.hpp b/inference-engine/src/mkldnn_plugin/nodes/list_tbl.hpp index d06cefa7985ac2..d005c1e16b630d 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/list_tbl.hpp +++ b/inference-engine/src/mkldnn_plugin/nodes/list_tbl.hpp @@ -7,24 +7,3 @@ # define MKLDNN_EXTENSION_NODE(__prim, __type) #endif -MKLDNN_EXTENSION_NODE(CTCLossImpl, CTCLoss); -MKLDNN_EXTENSION_NODE(ExperimentalDetectronTopKROIsImpl, ExperimentalDetectronTopKROIs); -MKLDNN_EXTENSION_NODE(ExtractImagePatchesImpl, ExtractImagePatches); -MKLDNN_EXTENSION_NODE(ReverseSequenceImpl, ReverseSequence); -MKLDNN_EXTENSION_NODE(DetectionOutputImpl, DetectionOutput); -MKLDNN_EXTENSION_NODE(ExperimentalDetectronDetectionOutputImpl, ExperimentalDetectronDetectionOutput); -MKLDNN_EXTENSION_NODE(LogSoftmaxImpl, LogSoftmax); -MKLDNN_EXTENSION_NODE(ReorgYoloImpl, ReorgYolo); -MKLDNN_EXTENSION_NODE(ExperimentalDetectronROIFeatureExtractorImpl, ExperimentalDetectronROIFeatureExtractor); -MKLDNN_EXTENSION_NODE(ExperimentalDetectronGenerateProposalsSingleImageImpl, ExperimentalDetectronGenerateProposalsSingleImage); -MKLDNN_EXTENSION_NODE(NonMaxSuppressionImpl, NonMaxSuppressionIEInternal); -MKLDNN_EXTENSION_NODE(TopKImpl, TopK); -MKLDNN_EXTENSION_NODE(ExperimentalDetectronPriorGridGeneratorImpl, ExperimentalDetectronPriorGridGenerator); -MKLDNN_EXTENSION_NODE(GRNImpl, GRN); -MKLDNN_EXTENSION_NODE(BucketizeImpl, Bucketize); -MKLDNN_EXTENSION_NODE(CTCGreedyDecoderImpl, CTCGreedyDecoder); -MKLDNN_EXTENSION_NODE(CTCGreedyDecoderSeqLenImpl, CTCGreedyDecoderSeqLen); -MKLDNN_EXTENSION_NODE(ProposalImpl, Proposal); -MKLDNN_EXTENSION_NODE(RangeImpl, Range); -MKLDNN_EXTENSION_NODE(GatherTreeImpl, GatherTree); 
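For reference: the deleted grn.cpp above normalizes every spatial position across channels, y[c] = x[c] / sqrt(sum_c x[c]^2 + bias). A single-threaded sketch with plain loops instead of parallel_for3d, assuming an NCHW buffer; the function name is illustrative:

    #include <cmath>
    #include <cstddef>

    // GRN over an NCHW buffer: divide each channel vector by its L2 norm plus bias.
    void grn_reference(const float* src, float* dst, int N, int C, int H, int W, float bias) {
        for (int n = 0; n < N; ++n)
            for (int h = 0; h < H; ++h)
                for (int w = 0; w < W; ++w) {
                    double sq_sum = 0.0;
                    for (int c = 0; c < C; ++c) {
                        const float v = src[((n * C + c) * H + h) * W + w];
                        sq_sum += static_cast<double>(v) * v;
                    }
                    const float norm = static_cast<float>(std::sqrt(sq_sum + bias));
                    for (int c = 0; c < C; ++c) {
                        const size_t off = ((n * C + c) * H + h) * W + w;
                        dst[off] = src[off] / norm;
                    }
                }
    }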
-MKLDNN_EXTENSION_NODE(CumSumImpl, CumSum); diff --git a/inference-engine/src/mkldnn_plugin/nodes/log_softmax.cpp b/inference-engine/src/mkldnn_plugin/nodes/log_softmax.cpp deleted file mode 100644 index 337549e3434be0..00000000000000 --- a/inference-engine/src/mkldnn_plugin/nodes/log_softmax.cpp +++ /dev/null @@ -1,136 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "base.hpp" - -#include -#include -#include -#include -#include -#include -#include "ie_parallel.hpp" -#include - -using namespace MKLDNNPlugin; - -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { - -class LogSoftmaxImpl: public ExtLayerBase { - bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { - try { - const auto logSoftMax = std::dynamic_pointer_cast(op); - if (!logSoftMax) { - errorMessage = "Only opset5 LogSoftmax operation is supported"; - return false; - } - } catch (...) { - return false; - } - return true; - } - -public: - explicit LogSoftmaxImpl(const std::shared_ptr& op) { - try { - std::string errorMessage; - if (!isSupportedOperation(op, errorMessage)) { - IE_THROW(NotImplemented) << errorMessage; - } - - errorPrefix = "LogSoftmax layer with name '" + op->get_friendly_name() + "'"; - const auto logSoftMax = std::dynamic_pointer_cast(op); - - if (op->get_input_size() != 1 || op->get_output_size() != 1) - IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; - - SizeVector dims = op->get_input_shape(0); - if (!dims.size()) - dims = SizeVector(1, 1); - int axis = logSoftMax->get_axis(); - if (axis < 0) - axis += dims.size(); - - if (dims.size() < static_cast((size_t)(1) + axis)) - IE_THROW() << errorPrefix << " has incorrect input parameters dimensions and axis number!"; - - int j; - for (j = dims.size() - 1; j >= 0; j--) { - if (dims[j] != 1) break; - } - if (j == axis) is_last_dim = true; - - for (int i = 0; i < axis; i++) - axis_step *= dims[i]; - reduced_axis_size = dims[axis]; - for (size_t i = (axis + 1); i < dims.size(); i++) - reduced_axis_stride *= dims[i]; - - addConfig(op, {{TensorDescCreatorTypes::ncsp, Precision::FP32}}, - {{TensorDescCreatorTypes::ncsp, Precision::FP32}}); - } catch (InferenceEngine::Exception &ex) { - errorMsg = ex.what(); - } - } - - StatusCode execute(std::vector& inputs, std::vector& outputs, ResponseDesc *resp) noexcept override { - const float *src_data = inputs[0]->cbuffer().as() + - inputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - float* dst_data = outputs[0]->buffer().as() + - outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - - if (is_last_dim) { - parallel_for(axis_step, [&](size_t i) { - const float *src_dataPtr = &src_data[i * reduced_axis_size]; - float *dst_dataPtr = &dst_data[i * reduced_axis_size]; - - float reduce_prod = 0.0f; - const float max = *std::max_element(src_dataPtr, src_dataPtr + reduced_axis_size); - for (size_t j = 0; j < reduced_axis_size; ++j) - reduce_prod += expf(src_dataPtr[j] - max); - - reduce_prod = logf(reduce_prod); - for (size_t j = 0; j < reduced_axis_size; ++j) - dst_dataPtr[j] = src_dataPtr[j] - max - reduce_prod; - }); - } else { - parallel_for2d(axis_step, reduced_axis_stride, [&](size_t k, size_t i) { - const float *src_dataPtr = &src_data[k * reduced_axis_stride * reduced_axis_size + i]; - float *dst_dataPtr = &dst_data[k * reduced_axis_stride * reduced_axis_size + i]; - - float reduce_prod = 0.0f; - float max = std::numeric_limits::min(); - for (size_t j = 0; j 
< reduced_axis_size; ++j) { - if (src_dataPtr[j * reduced_axis_stride] > max) - max = src_dataPtr[j * reduced_axis_stride]; - } - - for (size_t j = 0; j < reduced_axis_size; ++j) - reduce_prod += expf(src_dataPtr[j * reduced_axis_stride] - max); - - reduce_prod = logf(reduce_prod); - for (size_t j = 0; j < reduced_axis_size; ++j) - dst_dataPtr[j * reduced_axis_stride] = src_dataPtr[j * reduced_axis_stride] - max - reduce_prod; - }); - } - - return OK; - } - -private: - size_t reduced_axis_size; - size_t reduced_axis_stride = 1; - size_t axis_step = 1; - bool is_last_dim = false; - - std::string errorPrefix; -}; - -REG_FACTORY_FOR(LogSoftmaxImpl, LogSoftmax); - -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bucketize_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bucketize_node.cpp new file mode 100644 index 00000000000000..c6c327a1993f3d --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bucketize_node.cpp @@ -0,0 +1,218 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "base.hpp" + +#include +#include +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_bucketize_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNBucketizeNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto bucketsize = std::dynamic_pointer_cast(op); + if (!bucketsize) { + errorMessage = "Only opset3 Bucketize operation is supported"; + return false; + } + } catch (...) { + return false; + } + return true; +} + +MKLDNNBucketizeNode::MKLDNNBucketizeNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "Bucketize layer with name '" + op->get_friendly_name() + "' "; + const auto bucketsize = std::dynamic_pointer_cast(op); + + if (getOriginalInputsNumber() != 2 || getOriginalOutputsNumber() != 1) { + IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; + } + + // check one attribute + with_right = bucketsize->get_with_right_bound(); + + // check dimensions of input tensors + SizeVector input_tensor_dims = op->get_input_shape(INPUT_TENSOR_PORT); + if (input_tensor_dims.size() < 1) { + IE_THROW() << errorPrefix << " has incorrect dimensions of the input."; + } + SizeVector input_bin_dims = op->get_input_shape(INPUT_BINS_PORT); + if (input_bin_dims.size() != 1) { + IE_THROW() << errorPrefix << " has incorrect dimensions of the boundaries tensor."; + } + if (input_bin_dims[0] != 0) { + with_bins = true; + } + num_bin_values = input_bin_dims[0]; + + num_values = std::accumulate(input_tensor_dims.begin(), input_tensor_dims.end(), size_t(1), std::multiplies()); +} + +void MKLDNNBucketizeNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + // check precisions for input and output tensors + input_precision = getOriginalInputPrecisionAtPort(INPUT_TENSOR_PORT); + if (input_precision != Precision::FP32 && input_precision != Precision::I32 && + input_precision != Precision::I64) { + input_precision = Precision::FP32; + } + boundaries_precision = getOriginalInputPrecisionAtPort(INPUT_BINS_PORT); + if (boundaries_precision != Precision::FP32 && boundaries_precision != 
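For reference: the deleted log_softmax.cpp above computes log-softmax along one axis with the usual max-subtraction for numerical stability. A compact sketch of the per-row computation over a contiguous row of len values (the node additionally handles strided, non-last-axis layouts):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>

    // log_softmax over a contiguous row: dst[j] = src[j] - max - log(sum_k exp(src[k] - max)).
    // Subtracting the row maximum first keeps exp() from overflowing.
    void log_softmax_row(const float* src, float* dst, size_t len) {
        const float max_val = *std::max_element(src, src + len);
        float sum = 0.f;
        for (size_t j = 0; j < len; ++j)
            sum += std::exp(src[j] - max_val);
        const float log_sum = std::log(sum);
        for (size_t j = 0; j < len; ++j)
            dst[j] = src[j] - max_val - log_sum;
    }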
Precision::I32 && + boundaries_precision != Precision::I64) { + boundaries_precision = Precision::FP32; + } + output_precision = getOriginalOutputPrecisionAtPort(OUTPUT_TENSOR_PORT); + if (output_precision != Precision::I32 && output_precision != Precision::I64) { + output_precision = Precision::I32; + } + + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, input_precision}, + {TensorDescCreatorTypes::ncsp, boundaries_precision}}, + {{TensorDescCreatorTypes::ncsp, output_precision}}, + impl_desc_type::ref_any); +} + +void MKLDNNBucketizeNode::execute(mkldnn::stream strm) { + auto precision_mask = getPrecisionMask(input_precision, boundaries_precision, output_precision); + + switch (precision_mask) { + case getPrecisionMask(Precision::FP32, Precision::FP32, Precision::I32): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::FP32, Precision::FP32, Precision::I64): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::FP32, Precision::I32, Precision::I32): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::FP32, Precision::I32, Precision::I64): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::FP32, Precision::I64, Precision::I32): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::FP32, Precision::I64, Precision::I64): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::I32, Precision::FP32, Precision::I32): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::I32, Precision::FP32, Precision::I64): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::I32, Precision::I32, Precision::I32): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::I32, Precision::I32, Precision::I64): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::I32, Precision::I64, Precision::I32): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::I32, Precision::I64, Precision::I64): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::I64, Precision::FP32, Precision::I32): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::I64, Precision::FP32, Precision::I64): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::I64, Precision::I32, Precision::I32): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::I64, Precision::I32, Precision::I64): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::I64, Precision::I64, Precision::I32): + bucketize::value_type, + 
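The switch above selects a bucketize<>() instantiation from the (input, boundaries, output) precision triple packed into a single mask. The dispatch pattern in isolation, using an illustrative enum and packing function rather than the Inference Engine's Precision and getPrecisionMask helpers:

    #include <cstdint>
    #include <iostream>

    // Illustrative precision tags and mask packing; the real node uses
    // InferenceEngine::Precision and the plugin's own getPrecisionMask.
    enum class Prec : uint32_t { FP32 = 1, I32 = 2, I64 = 3 };

    constexpr uint32_t precision_mask(Prec in, Prec bounds, Prec out) {
        return (static_cast<uint32_t>(in) << 16) |
               (static_cast<uint32_t>(bounds) << 8) |
                static_cast<uint32_t>(out);
    }

    template <typename TIn, typename TBound, typename TOut>
    void bucketize_stub() { std::cout << "instantiated\n"; }  // stand-in for the templated kernel

    void dispatch(Prec in, Prec bounds, Prec out) {
        switch (precision_mask(in, bounds, out)) {
            case precision_mask(Prec::FP32, Prec::FP32, Prec::I32):
                bucketize_stub<float, float, int32_t>();
                break;
            case precision_mask(Prec::FP32, Prec::I32, Prec::I64):
                bucketize_stub<float, int32_t, int64_t>();
                break;
            // ... the remaining combinations follow the same pattern ...
            default:
                break;  // unsupported combination
        }
    }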
PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + case getPrecisionMask(Precision::I64, Precision::I64, Precision::I64): + bucketize::value_type, + PrecisionTrait::value_type, + PrecisionTrait::value_type>(); + break; + default: + IE_THROW() << errorPrefix << " has unsupported precision: " << precision_mask; + } +} + +template +void MKLDNNBucketizeNode::bucketize() { + const auto *input_data = reinterpret_cast(getParentEdgeAt(0)->getMemoryPtr()->GetPtr()); + const auto *boundaries_data = reinterpret_cast(getParentEdgeAt(1)->getMemoryPtr()->GetPtr()); + auto *output_data = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + + if (!with_bins) { + memset(output_data, 0, num_values * sizeof(T_IND)); + return; + } + + // boundaries are assumed to be sorted and to have unique elements + parallel_for(num_values, [&](size_t ind) { + T value = input_data[ind]; + if (with_right) { + auto low = std::lower_bound(boundaries_data, boundaries_data + num_bin_values, value); + output_data[ind] = static_cast(low - boundaries_data); + } else { + auto up = std::upper_bound(boundaries_data, boundaries_data + num_bin_values, value); + output_data[ind] = static_cast(up - boundaries_data); + } + }); +} + +bool MKLDNNBucketizeNode::created() const { + return getType() == Bucketize; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNBucketizeNode, Bucketize) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bucketize_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bucketize_node.h new file mode 100644 index 00000000000000..472e6aee3cfb03 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_bucketize_node.h @@ -0,0 +1,43 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNBucketizeNode : public MKLDNNNode { +public: + MKLDNNBucketizeNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + template + void bucketize(); + + const size_t INPUT_TENSOR_PORT = 0; + const size_t INPUT_BINS_PORT = 1; + const size_t OUTPUT_TENSOR_PORT = 0; + + size_t num_values = 0; + size_t num_bin_values = 0; + bool with_right = false; + bool with_bins = false; + + InferenceEngine::Precision input_precision; + InferenceEngine::Precision boundaries_precision; + InferenceEngine::Precision output_precision; + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp index ba760cae535806..4990a658d61f1c 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp @@ -33,7 +33,7 @@ namespace { bool MKLDNNConcatNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - auto concatOp = ngraph::as_type_ptr(op); + const auto concatOp = ngraph::as_type_ptr(op); if (!concatOp) { errorMessage = "Node is not an instance of the Concat operation."; return false; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_convert_node.cpp 
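The bucketize<>() body above maps each value to a bin index with std::lower_bound / std::upper_bound over the sorted, unique boundaries. A single-threaded sketch with concrete types; with_right_bound mirrors the node's with_right flag:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Right-closed bins use lower_bound, right-open bins use upper_bound,
    // exactly mirroring the with_right flag handled above.
    std::vector<int32_t> bucketize_ref(const std::vector<float>& values,
                                       const std::vector<float>& boundaries,
                                       bool with_right_bound) {
        std::vector<int32_t> out(values.size(), 0);
        if (boundaries.empty())
            return out;                       // no bins: everything falls into bucket 0
        for (size_t i = 0; i < values.size(); ++i) {
            const auto it = with_right_bound
                ? std::lower_bound(boundaries.begin(), boundaries.end(), values[i])
                : std::upper_bound(boundaries.begin(), boundaries.end(), values[i]);
            out[i] = static_cast<int32_t>(it - boundaries.begin());
        }
        return out;
    }
    // Example: values {0.5, 2.0, 7.0} with boundaries {1.0, 3.0, 5.0} -> {0, 1, 3}.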
b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_convert_node.cpp index d226dd73890ec6..678922f3a4b5b1 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_convert_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_convert_node.cpp @@ -42,6 +42,8 @@ MKLDNNConvertNode::MKLDNNConvertNode(const InferenceEngine::SizeVector &dims, co addOriginalInputPrecision(inPrc); outDims.emplace_back(dims); addOriginalOutputPrecision(outPrc); + + errorPrefix = "Convert node with name '" + getName() + "'"; } void MKLDNNConvertNode::getSupportedDescriptors() { diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_node.cpp new file mode 100644 index 00000000000000..34c9aaf191e697 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_node.cpp @@ -0,0 +1,167 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "base.hpp" + +#include +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_ctc_greedy_decoder_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNCTCGreedyDecoderNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto greedyDecOp = ngraph::as_type_ptr(op); + if (!greedyDecOp) { + errorMessage = "Node is not an instance of the CTCGreedyDecoder operation from operation set v0."; + return false; + } + } catch (...) { + return false; + } + return true; +} + +MKLDNNCTCGreedyDecoderNode::MKLDNNCTCGreedyDecoderNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "CTCGreedyDecoder layer with name '" + op->get_friendly_name() + "' "; + if (getOriginalInputsNumber() != 2) + IE_THROW() << errorPrefix << "has invalid number of input edges: " << getOriginalInputsNumber(); + if (getOriginalOutputsNumber() != 1) + IE_THROW() << errorPrefix << "has invalid number of outputs edges: " << getOriginalOutputsNumber(); + + if (op->get_input_shape(DATA_INDEX)[0] != op->get_input_shape(SEQUENCE_LENGTH_INDEX)[0] && + op->get_input_shape(DATA_INDEX)[1] != op->get_input_shape(SEQUENCE_LENGTH_INDEX)[1]) + IE_THROW() << errorPrefix << "has invalid input shapes."; + + auto greedyDecOp = ngraph::as_type_ptr(op); + mergeRepeated = greedyDecOp->get_ctc_merge_repeated(); +} + +void MKLDNNCTCGreedyDecoderNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + Precision inDataPrecision = getOriginalInputPrecisionAtPort(DATA_INDEX); + if (inDataPrecision != Precision::FP32 && inDataPrecision != Precision::BF16) + IE_THROW() << errorPrefix << "has unsupported 'data' input precision: " << inDataPrecision; + + Precision seqLenPrecision = getOriginalInputPrecisionAtPort(SEQUENCE_LENGTH_INDEX); + if (seqLenPrecision != Precision::FP32 && seqLenPrecision != Precision::BF16) + IE_THROW() << errorPrefix << "has unsupported 'sequence_length' input precision: " << seqLenPrecision; + + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}}, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); +} + +void MKLDNNCTCGreedyDecoderNode::execute(mkldnn::stream strm) { + const float* probabilities 
= reinterpret_cast(getParentEdgeAt(DATA_INDEX)->getMemoryPtr()->GetPtr()); + const float* sequenceMask = reinterpret_cast(getParentEdgeAt(SEQUENCE_LENGTH_INDEX)->getMemoryPtr()->GetPtr()); + float* outputSequences = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + + const size_t T = getParentEdgeAt(DATA_INDEX)->getDims()[0]; + const size_t B = getParentEdgeAt(DATA_INDEX)->getDims()[1]; + const int C = getParentEdgeAt(DATA_INDEX)->getDims()[2]; + const size_t BC = B * C; + const size_t CB1 = C * (B - 1); + + const int blankIndex = C - 1; + + std::vector sequenceLengths(B, 0); + parallel_for(B, [&](size_t b) { + size_t t = 0; + for (; t < T; t++) { + if (sequenceMask[B * t + b] == 0.f) + break; + } + sequenceLengths[b] = t; + }); + + size_t workAmount = 0; + for (size_t b = 0; b < B; b++) { + workAmount += sequenceLengths[b]; + } + + // Parallelization could not be made directly by T due to output index depends on merged classes and + // blank index, thus could not be shared between threads. Better to divide operation on two steps. + // At the first stage find the maximum index. At second stage merge if needed. + // Such approach makes parallelization more efficient. + auto threadBody = [&](const int ithr, const int nthr) { + size_t start(0lu), end(0lu); + splitter(workAmount, nthr, ithr, start, end); + if (start >= end) + return; + size_t tStart = 0lu, bStart = 0lu; + for (; bStart < B; bStart++) { + tStart += sequenceLengths[bStart]; + if (tStart >= start) { + tStart = start - (tStart - sequenceLengths[bStart]); + break; + } + } + + size_t workCounter = start; + + for (size_t b = bStart; b < B; ++b) { + size_t outputIndex = b * T + tStart; + const float* probs = probabilities + b * C + BC * tStart; + size_t sequenceLength = sequenceLengths[b]; + + for (size_t t = tStart; t < sequenceLength; ++t) { + int maxClassIdx = 0; + + float maxProb = probs[0]; + ++probs; + + for (int c = 1; c < C; ++c, ++probs) { + if (*probs > maxProb) { + maxClassIdx = c; + maxProb = *probs; + } + } + probs += CB1; + outputSequences[outputIndex++] = static_cast(maxClassIdx); + + if (++workCounter >= end) { + return; + } + } + tStart = 0lu; + } + }; // thread body + + parallel_nt(0, threadBody); + + parallel_for(B, [&](size_t b) { + int prevClassIdx = -1; + size_t outputIndex = b * T; + const size_t sequenceLength = sequenceLengths[b]; + float* shiftedOut = outputSequences + b * T; + for (size_t t = 0; t < sequenceLength; ++t) { + if (*shiftedOut < blankIndex && + !(mergeRepeated && *shiftedOut == prevClassIdx)) { + outputSequences[outputIndex++] = *shiftedOut; + } + prevClassIdx = *shiftedOut; + shiftedOut++; + } + std::fill(outputSequences + outputIndex, outputSequences + (b + 1) * T, -1.f); + }); +} + +bool MKLDNNCTCGreedyDecoderNode::created() const { + return getType() == CTCGreedyDecoder; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNCTCGreedyDecoderNode, CTCGreedyDecoder) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_node.h new file mode 100644 index 00000000000000..26554ae7333dca --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_node.h @@ -0,0 +1,32 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNCTCGreedyDecoderNode : public MKLDNNNode { +public: + MKLDNNCTCGreedyDecoderNode(const std::shared_ptr& op, const mkldnn::engine& eng, 
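The execute() above runs in two passes: a parallel argmax over classes per time step, then a merge pass that drops blanks, optionally collapses repeated classes, and pads the tail with -1. A compact single-sequence sketch of the same idea; probs is assumed to be [T, C] row-major and, as in the node, the blank class is the last one:

    #include <algorithm>
    #include <vector>

    // Greedy CTC decode of one sequence. Returns class ids with blanks removed and
    // (optionally) repeats merged; the unused tail stays at -1, as the node pads it.
    std::vector<int> ctc_greedy_decode(const std::vector<float>& probs,
                                       size_t T, size_t C, bool merge_repeated) {
        const int blank = static_cast<int>(C) - 1;
        std::vector<int> out(T, -1);
        size_t out_len = 0;
        int prev = -1;
        for (size_t t = 0; t < T; ++t) {
            const float* row = probs.data() + t * C;
            const int best = static_cast<int>(std::max_element(row, row + C) - row);
            if (best != blank && !(merge_repeated && best == prev))
                out[out_len++] = best;
            prev = best;
        }
        return out;
    }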
MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + const size_t DATA_INDEX = 0lu; + const size_t SEQUENCE_LENGTH_INDEX = 1lu; + bool mergeRepeated; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_seq_len_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_seq_len_node.cpp new file mode 100644 index 00000000000000..0eccdbfa1b5b07 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_seq_len_node.cpp @@ -0,0 +1,170 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "base.hpp" + +#include +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_ctc_greedy_decoder_seq_len_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNCTCGreedyDecoderSeqLenNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto greedyDecOp = ngraph::as_type_ptr(op); + if (!greedyDecOp) { + errorMessage = "Node is not an instance of the CTCGreedyDecoderSeqLen operation from operation set v6."; + return false; + } + } catch (...) { + return false; + } + return true; +} + +MKLDNNCTCGreedyDecoderSeqLenNode::MKLDNNCTCGreedyDecoderSeqLenNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "CTCGreedyDecoderSeqLen layer with name '" + op->get_friendly_name() + "' "; + if (getOriginalInputsNumber() < 2 || getOriginalInputsNumber() > 3) + IE_THROW() << errorPrefix << "has invalid number of input edges: " << getOriginalInputsNumber(); + if (getOriginalOutputsNumber() != 2) + IE_THROW() << errorPrefix << "has invalid number of outputs edges: " << getOriginalOutputsNumber(); + + if (op->get_input_shape(DATA_INDEX)[0] != op->get_input_shape(SEQUENCE_LENGTH_INDEX)[0]) + IE_THROW() << errorPrefix << "has invalid input shapes."; + + auto greedyDecOp = ngraph::as_type_ptr(op); + mergeRepeated = greedyDecOp->get_merge_repeated(); +} + +void MKLDNNCTCGreedyDecoderSeqLenNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + Precision inDataPrecision = getOriginalInputPrecisionAtPort(DATA_INDEX); + if (inDataPrecision != Precision::FP32 && inDataPrecision != Precision::BF16) + IE_THROW() << errorPrefix << "has unsupported 'data' input precision: " << inDataPrecision; + + Precision seqLenPrecision = getOriginalInputPrecisionAtPort(SEQUENCE_LENGTH_INDEX); + if (seqLenPrecision != Precision::I32 && seqLenPrecision != Precision::I64) + IE_THROW() << errorPrefix << "has unsupported 'sequence_length' input precision: " << seqLenPrecision; + + std::vector inDataConf; + inDataConf.reserve(getOriginalInputsNumber()); + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::FP32); + for (int i = 1; i < getOriginalInputsNumber(); ++i) + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::I32); + + addSupportedPrimDesc(inDataConf, + 
{{TensorDescCreatorTypes::ncsp, Precision::I32},
+                          {TensorDescCreatorTypes::ncsp, Precision::I32}},
+                         impl_desc_type::ref_any);
+}
+
+void MKLDNNCTCGreedyDecoderSeqLenNode::execute(mkldnn::stream strm) {
+    const float* probabilities = reinterpret_cast<const float*>(getParentEdgeAt(DATA_INDEX)->getMemoryPtr()->GetPtr());
+    const int* sequenceLengths = reinterpret_cast<const int*>(getParentEdgeAt(SEQUENCE_LENGTH_INDEX)->getMemoryPtr()->GetPtr());
+    int* decodedClasses = reinterpret_cast<int*>(getChildEdgesAtPort(DECODED_CLASSES_INDEX)[0]->getMemoryPtr()->GetPtr());
+    int* decodedClassesLength = reinterpret_cast<int*>(getChildEdgesAtPort(DECODED_CLASSES_LENGTH_INDEX)[0]->getMemoryPtr()->GetPtr());
+
+    const size_t B = getParentEdgeAt(DATA_INDEX)->getDims()[0];
+    const size_t T = getParentEdgeAt(DATA_INDEX)->getDims()[1];
+    const int C = getParentEdgeAt(DATA_INDEX)->getDims()[2];
+    const size_t TC = T * C;
+
+    int blankIndex = C - 1;
+    if (inDims.size() > BLANK_INDEX)
+        blankIndex = (reinterpret_cast<const int*>(getParentEdgeAt(BLANK_INDEX)->getMemoryPtr()->GetPtr()))[0];
+
+    size_t workAmount = 0;
+    for (size_t b = 0; b < B; b++) {
+        if (sequenceLengths[b] > T) {
+            std::string errorMsg = errorPrefix
+                                   + ". Sequence length " + std::to_string(sequenceLengths[b])
+                                   + " cannot be greater than the corresponding decoded classes dimension size "
+                                   + std::to_string(getChildEdgesAtPort(DECODED_CLASSES_INDEX)[0]->getDims()[1]);
+            IE_THROW() << errorMsg;
+        }
+        workAmount += sequenceLengths[b];
+    }
+    // Parallelization cannot be done directly over T because the output index depends on the merged
+    // classes and the blank index, so it cannot be shared between threads. It is better to split the
+    // operation into two stages: first find the maximum index at each step, then merge where needed.
+    // Such an approach makes parallelization more efficient.
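Here the work is flattened to sum_b sequence_length[b] items, split evenly across threads, and each thread then locates its starting (batch, time) position. The two helpers below sketch that bookkeeping in isolation; split_work follows the same contract as the splitter() utility used in the node but is an illustrative reimplementation:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Even split of `total` work items across nthr threads for thread ithr.
    void split_work(size_t total, int nthr, int ithr, size_t& start, size_t& end) {
        const size_t chunk = total / nthr;
        const size_t rem   = total % nthr;
        start = ithr * chunk + std::min<size_t>(ithr, rem);
        end   = start + chunk + (static_cast<size_t>(ithr) < rem ? 1 : 0);
    }

    // Map a flat work index back to the (batch, time) position it belongs to,
    // given per-batch sequence lengths -- the same bookkeeping the thread body does.
    void locate(const std::vector<size_t>& seq_len, size_t flat, size_t& batch, size_t& time) {
        size_t acc = 0;
        for (batch = 0; batch < seq_len.size(); ++batch) {
            if (flat < acc + seq_len[batch]) { time = flat - acc; return; }
            acc += seq_len[batch];
        }
        time = 0;  // flat index past the end: nothing to do
    }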
+ auto threadBody = [&](const int ithr, const int nthr) { + size_t start(0lu), end(0lu); + splitter(workAmount, nthr, ithr, start, end); + if (start >= end) + return; + size_t tStart = 0lu, bStart = 0lu; + for (; bStart < B; bStart++) { + tStart += sequenceLengths[bStart]; + if (tStart >= start) { + tStart = start - (tStart - sequenceLengths[bStart]); + break; + } + } + + size_t workCounter = start; + + for (size_t b = bStart; b < B; ++b) { + size_t outputIndex = b * T + tStart; + const float* probs = probabilities + b * TC + C * tStart; + const size_t actualSeqLen = sequenceLengths[b]; + + for (size_t t = tStart; t < actualSeqLen; ++t) { + int maxClassIdx = 0; + float maxProb = probs[0]; + probs++; + + for (int c = 1; c < C; c++, probs++) { + if (*probs > maxProb) { + maxClassIdx = c; + maxProb = *probs; + } + } + decodedClasses[outputIndex++] = maxClassIdx; + + if (++workCounter >= end) { + return; + } + } + tStart = 0lu; + } + }; // thread body + + parallel_nt(0, threadBody); + + parallel_for(B, [&](size_t b) { + int prevClassIdx = -1; + size_t outputIndex = b * T; + const size_t actualSeqLen = sequenceLengths[b]; + int* shiftedOut = decodedClasses + b * T; + + for (size_t t = 0; t < actualSeqLen; ++t) { + if (*shiftedOut != blankIndex && + !(mergeRepeated && *shiftedOut == prevClassIdx)) { + decodedClasses[outputIndex++] = *shiftedOut; + } + prevClassIdx = *shiftedOut; + shiftedOut++; + } + std::fill(decodedClasses + outputIndex, decodedClasses + (b + 1) * T, -1); + decodedClassesLength[b] = outputIndex - b * T; + }); +} + +bool MKLDNNCTCGreedyDecoderSeqLenNode::created() const { + return getType() == CTCGreedyDecoderSeqLen; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNCTCGreedyDecoderSeqLenNode, CTCGreedyDecoderSeqLen) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_seq_len_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_seq_len_node.h new file mode 100644 index 00000000000000..b1d5ab6d9ffef3 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_greedy_decoder_seq_len_node.h @@ -0,0 +1,35 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNCTCGreedyDecoderSeqLenNode : public MKLDNNNode { +public: + MKLDNNCTCGreedyDecoderSeqLenNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + const size_t DATA_INDEX = 0lu; + const size_t SEQUENCE_LENGTH_INDEX = 1lu; + const size_t BLANK_INDEX = 2lu; + const size_t DECODED_CLASSES_INDEX = 0lu; + const size_t DECODED_CLASSES_LENGTH_INDEX = 1lu; + bool mergeRepeated; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_loss_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_loss_node.cpp new file mode 100644 index 00000000000000..b355dcaefcd4b0 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_loss_node.cpp @@ -0,0 +1,279 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include "ie_parallel.hpp" +#include 
"mkldnn_ctc_loss_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNCTCLossNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto ctcLossOp = ngraph::as_type_ptr(op); + if (!ctcLossOp) { + errorMessage = "Node is not an instance of the CTCLoss operation from operation set v4."; + return false; + } + } catch (...) { + return false; + } + return true; +} + +MKLDNNCTCLossNode::MKLDNNCTCLossNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = std::string("CTCLoss layer with name '") + op->get_friendly_name() + "'"; + + if (getOriginalInputsNumber() != 4 && getOriginalInputsNumber() != 5) + IE_THROW() << errorPrefix << " has invalid inputs number."; + + auto ctcLossOp = ngraph::as_type_ptr(op); + ctcMergeRepeated = ctcLossOp->get_ctc_merge_repeated(); + preprocessCollapseRepeated = ctcLossOp->get_preprocess_collapse_repeated(); + unique = ctcLossOp->get_unique(); +} + +void MKLDNNCTCLossNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + std::vector inDataConf; + inDataConf.reserve(getOriginalInputsNumber()); + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::FP32); + for (int i = 1; i < getOriginalInputsNumber(); ++i) + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::I32); + + addSupportedPrimDesc(inDataConf, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); +} + +void MKLDNNCTCLossNode::execute(mkldnn::stream strm) { + StatusCode returnCode = OK; + + const float* logits = reinterpret_cast(getParentEdgeAt(0)->getMemoryPtr()->GetPtr()); + const int* logitsLength = reinterpret_cast(getParentEdgeAt(1)->getMemoryPtr()->GetPtr()); + const int* labels = reinterpret_cast(getParentEdgeAt(2)->getMemoryPtr()->GetPtr()); + const int* labelsLength = reinterpret_cast(getParentEdgeAt(3)->getMemoryPtr()->GetPtr()); + float* dstData = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + + const size_t batchNum = getParentEdgeAt(0)->getDims()[0]; + const size_t maxTime = getParentEdgeAt(0)->getDims()[1]; + const size_t classesNum = getParentEdgeAt(0)->getDims()[2]; + + int blankIndex = classesNum - 1; + if (inDims.size() > 4) { + blankIndex = reinterpret_cast(getParentEdgeAt(4)->getMemoryPtr()->GetPtr())[0]; + } + + std::vector decodedTargetLenB(batchNum, 0); + std::vector> targetDB(batchNum); + std::vector>> logProbabilitiesB(batchNum); + std::vector errorMsgB(parallel_get_max_threads()); + + auto threadBody_1 = [&](const int ithr, const int nthr) { + size_t start(0lu), end(0lu); + splitter(batchNum, nthr, ithr, start, end); + if (start >= end) + return; + + for (size_t b = start; b < end; b++) { + if (logitsLength[b] < 0 || labelsLength[b] < 0 || logitsLength[b] > maxTime || labelsLength[b] > logitsLength[b]) { + errorMsgB[ithr] = errorPrefix + ". Logit length cannot be greater than max sequence length. 
" + + "Label length cannot be greater than a logit length" + + " and both cannot be negative.\nMaxSeqLen: " + + std::to_string(maxTime) + "; Logit len: " + std::to_string(logitsLength[b]) + + "; Label len: " + std::to_string(labelsLength[b]); + returnCode = GENERAL_ERROR; + return; + } + const size_t actualLogitLen = logitsLength[b]; + const size_t actualTargetLen = labelsLength[b]; + size_t decodedTargetLen = 0lu; + + // Decoding target: merge repeated characters if preprocess_collapse_repeated == True, + // find unique elemnts if unique == True. + // Inserts blanks before each index and a blank at the end. + const int* target = &labels[b * maxTime]; + targetDB[b].resize(actualTargetLen * 2 + 1); + auto& targetD = targetDB[b]; + if (unique) { + std::unordered_set uniqVals; + for (size_t t = 0lu; t < actualTargetLen; t++) { + if (uniqVals.find(target[t]) != uniqVals.end()) { + continue; + } + uniqVals.insert(target[t]); + targetD[decodedTargetLen++] = blankIndex; + targetD[decodedTargetLen++] = target[t]; + } + targetD[decodedTargetLen++] = blankIndex; + } else if (preprocessCollapseRepeated) { + auto prevValue = target[0]; + targetD[decodedTargetLen++] = blankIndex; + targetD[decodedTargetLen++] = target[0]; + for (size_t t = 1lu; t < actualTargetLen; t++) { + if (target[t] == prevValue) { + continue; + } + targetD[decodedTargetLen++] = blankIndex; + targetD[decodedTargetLen++] = prevValue = target[t]; + } + targetD[decodedTargetLen++] = blankIndex; + } else { + for (size_t t = 0lu; t < actualTargetLen; t++) { + targetD[decodedTargetLen++] = blankIndex; + targetD[decodedTargetLen++] = target[t]; + } + targetD[decodedTargetLen++] = blankIndex; + } + decodedTargetLenB[b] = decodedTargetLen; + + auto& logProbabilities = logProbabilitiesB[b]; + logProbabilities.resize(actualLogitLen); + for (size_t ll = 0; ll < actualLogitLen; ll++) { + logProbabilities[ll].resize(decodedTargetLen); + } + } // for batch + }; // threadBody_1 + + parallel_nt(0, threadBody_1); + if (returnCode != OK) { + std::string resErr(""); + for (auto& err : errorMsgB) { + if (!err.empty()) + resErr += err + "\n"; + } + IE_THROW() << resErr; + } + + const size_t TC = maxTime * classesNum; + + size_t workAmount2 = 0lu; + for (size_t b = 0; b < batchNum; b++) { + workAmount2 += logitsLength[b]; + } + + auto threadBody_2 = [&](const int ithr, const int nthr) { + size_t start(0lu), end(0lu); + size_t sB(0lu), sT(0lu); + splitter(workAmount2, nthr, ithr, start, end); + if (start >= end) + return; + int64_t cw = 0, st = start; + for (; sB < batchNum; sB++) { + cw += logitsLength[sB]; + if (cw >= st) { + sT = logitsLength[sB] + st - cw; + break; + } + } + size_t workCounter = start; + + for (size_t b = sB; b < batchNum; b++) { + const size_t actualLogitLen = logitsLength[b]; + const size_t decodedTargetLen = decodedTargetLenB[b]; + auto& logProbabilities = logProbabilitiesB[b]; + auto& targetD = targetDB[b]; + + double expSum = 0.0; + size_t btcT = b * TC + sT * classesNum; + // logProbabilities = logSoftmax = logits[b][t][c] - ln(sum_c(exp(logits[b][t]))) + for (size_t t = sT; t < actualLogitLen; t++) { + expSum = 0.0; + for (size_t c = 0lu; c < classesNum; c++) { + expSum += std::exp(logits[btcT + c]); + } + for (size_t s = 0lu; s < decodedTargetLen; s++) { + logProbabilities[t][s] = logits[btcT + targetD[s]] - std::log(expSum); + } + btcT += classesNum; + if (++workCounter >= end) { + return; + } + } + sT = 0lu; + } // for batch + }; // threadBody_2 + + parallel_nt(0, threadBody_2); + + const auto float_inf = 
std::numeric_limits::infinity(); + + auto sumLogs = [&float_inf](float log1, float log2) { + if (log1 == -float_inf) { + return log2; + } else if (log2 == -float_inf) { + return log1; + } else { + if (log1 > log2) + return log1 + std::log1pf(std::exp(log2 - log1)); + else + return log2 + std::log1pf(std::exp(log1 - log2)); + } + }; + + auto threadBody_3 = [&](const int ithr, const int nthr) { + size_t start(0lu), end(0lu); + splitter(batchNum, nthr, ithr, start, end); + if (start >= end) + return; + + // As per Connectionist Temporal Classification - Labeling Unsegmented Sequence Data with Recurrent Neural Networks: + // Graves et al., 2016, paragraph 4.1 (10) + for (size_t b = start; b < end; b++) { + auto& targetD = targetDB[b]; + auto& logProbabilities = logProbabilitiesB[b]; + const int actualLogitLen = logitsLength[b]; + const int decodedTargetLen = decodedTargetLenB[b]; + std::vector> logBwd(decodedTargetLen, std::vector(actualLogitLen, -float_inf)); + for (int s = decodedTargetLen - 2; s < decodedTargetLen; s++) + logBwd[s][actualLogitLen - 1] = 0.f; + + for (int t = actualLogitLen - 2; t >= 0; t--) { + const int t_1 = t + 1; + for (int s = std::max(0, decodedTargetLen - (2 * (actualLogitLen - t))); + s < std::min(decodedTargetLen, 2 * (t_1)); s++) { + if (ctcMergeRepeated || targetD[s] == blankIndex) { + logBwd[s][t] = sumLogs(logBwd[s][t], + logBwd[s][t_1] + logProbabilities[t_1][s]); + } + + if (s + 1 < decodedTargetLen) { + logBwd[s][t] = sumLogs(logBwd[s][t], + logBwd[s + 1][t_1] + logProbabilities[t_1][s + 1]); + } + + if (s + 2 < decodedTargetLen) { + if (targetD[s] != blankIndex && (!ctcMergeRepeated || (targetD[s] != targetD[s + 2]))) { + logBwd[s][t] = sumLogs(logBwd[s][t], + logBwd[s + 2][t_1] + logProbabilities[t_1][s + 2]); + } + } + } + } + + logBwd[0][0] += logProbabilities[0][0]; + logBwd[1][0] += logProbabilities[0][(decodedTargetLen > 1) ? 
1 : 0]; + + dstData[b] = -sumLogs(logBwd[0][0], logBwd[1][0]); + } // for batch + }; // threadBody_3 + + parallel_nt(0, threadBody_3); +} + +bool MKLDNNCTCLossNode::created() const { + return getType() == CTCLoss; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNCTCLossNode, CTCLoss) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_loss_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_loss_node.h new file mode 100644 index 00000000000000..b46ff413e829be --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_ctc_loss_node.h @@ -0,0 +1,32 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNCTCLossNode : public MKLDNNNode { +public: + MKLDNNCTCLossNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + bool ctcMergeRepeated; + bool preprocessCollapseRepeated; + bool unique; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_cum_sum_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_cum_sum_node.cpp new file mode 100644 index 00000000000000..3f6c8f903482ce --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_cum_sum_node.cpp @@ -0,0 +1,279 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "list.hpp" +#include "base.hpp" + +#include +#include + +#include +#include +#include "ie_parallel.hpp" +#include "ie_precision.hpp" +#include +#include "mkldnn_cum_sum_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNCumSumNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto cumsum = std::dynamic_pointer_cast(op); + if (!cumsum) { + errorMessage = "Only opset3 CumSum operation is supported"; + return false; + } + } catch (...) 
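The sumLogs lambda above adds two probabilities while staying in log space. A standalone version of that log-sum-exp trick, assuming -infinity encodes log(0):

    #include <cmath>
    #include <limits>

    // log(exp(a) + exp(b)) computed without overflow: factor out the larger argument.
    float log_add(float a, float b) {
        const float neg_inf = -std::numeric_limits<float>::infinity();
        if (a == neg_inf) return b;
        if (b == neg_inf) return a;
        if (a > b)
            return a + std::log1p(std::exp(b - a));
        return b + std::log1p(std::exp(a - b));
    }
    // Example: log_add(std::log(0.2f), std::log(0.3f)) is approximately std::log(0.5f).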
{ + return false; + } + return true; +} + +MKLDNNCumSumNode::MKLDNNCumSumNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "CumSum layer with name '" + op->get_friendly_name() + "' "; + + if ((getOriginalInputsNumber() != numOfInputs && getOriginalInputsNumber() != (numOfInputs - 1)) || getOriginalOutputsNumber() != 1) + IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; + + const auto &dataShape = op->get_input_shape(CUM_SUM_DATA); + if (dataShape.size() < 1) { + IE_THROW() << errorPrefix << " doesn't support 'data' input tensor with rank: " << dataShape.size(); + } + numOfDims = dataShape.size(); + + const auto cumsum = std::dynamic_pointer_cast(op); + exclusive = cumsum->is_exclusive(); + reverse = cumsum->is_reverse(); + + if (getOriginalInputsNumber() == numOfInputs) { + if (!ngraph::is_scalar(cumsum->get_input_shape(AXIS))) + IE_THROW() << errorPrefix << " doesn't support 'axis' input tensor with non scalar rank"; + } + + if (dataShape != cumsum->get_output_shape(0)) + IE_THROW() << errorPrefix << " has different 'data' input and output dimensions"; + + shape = dataShape; +} + +void MKLDNNCumSumNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + dataPrecision = getOriginalInputPrecisionAtPort(CUM_SUM_DATA); + if (dataPrecision != Precision::I8 && dataPrecision != Precision::U8 && dataPrecision != Precision::I16 && dataPrecision != Precision::I32 && + dataPrecision != Precision::FP32 && dataPrecision != Precision::I64 && dataPrecision != Precision::U64 && dataPrecision != Precision::BF16) + IE_THROW() << errorPrefix << " has unsupported 'data' input precision: " << dataPrecision.name(); + + if (getOriginalInputsNumber() == numOfInputs) { + const auto &axisTensorPrec = getOriginalInputPrecisionAtPort(AXIS); + if (axisTensorPrec != Precision::I32 && axisTensorPrec != Precision::I64) + IE_THROW() << errorPrefix << " has unsupported 'axis' input precision: " << axisTensorPrec.name(); + } + + std::vector inDataConf; + inDataConf.reserve(getOriginalInputsNumber()); + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, dataPrecision); + for (int i = 1; i < getOriginalInputsNumber(); ++i) + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::I32); + + addSupportedPrimDesc(inDataConf, + {{TensorDescCreatorTypes::ncsp, dataPrecision}}, + impl_desc_type::ref_any); +} + +void MKLDNNCumSumNode::execute(mkldnn::stream strm) { + if (inDims.size() == numOfInputs) + axis = getAxis(getParentEdgeAt(AXIS)->getBlob(), getParentEdgeAt(CUM_SUM_DATA)->getBlob()); + + switch (dataPrecision) { + case Precision::I8 : { + exec(); + break; + } + case Precision::U8 : { + exec(); + break; + } + case Precision::I16 : { + exec(); + break; + } + case Precision::I32 : { + exec(); + break; + } + case Precision::FP32 : { + exec(); + break; + } + case Precision::I64 : { + exec(); + break; + } + case Precision::U64 : { + exec(); + break; + } + default : { + std::string errorMsg = errorPrefix + " has unsupported 'data' input precision: " + dataPrecision.name(); + IE_THROW() << errorMsg; + } + } +} + + +template +void MKLDNNCumSumNode::exec() { + const auto *input = reinterpret_cast(getParentEdgeAt(CUM_SUM_DATA)->getMemoryPtr()->GetPtr()); + auto *output = 
reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + const std::vector strides = getParentEdgeAt(CUM_SUM_DATA)->getDesc().getBlockingDesc().getStrides(); + + if (reverse) { + if (exclusive) { + cumSum(input, output, strides); + } else { + cumSum(input, output, strides); + } + } else { + if (exclusive) { + cumSum(input, output, strides); + } else { + cumSum(input, output, strides); + } + } +} + +template +void MKLDNNCumSumNode::cumSum(const dataType *input, dataType *output, const std::vector &strides) { + SizeVector iterationRange(numOfDims - 1); + size_t j = 0; + for (size_t i = 0; i < shape.size(); i++) { + if (i == axis) + continue; + iterationRange[j++] = shape[i]; + } + size_t work_amount_dst = std::accumulate(iterationRange.begin(), iterationRange.end(), 1, std::multiplies()); + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t start = 0, end = 0; + SizeVector counters(numOfDims - 1, 0); + splitter(work_amount_dst, nthr, ithr, start, end); + + parallelItInit(start, counters, iterationRange); + + for (size_t iwork = start; iwork < end; ++iwork) { + std::vector forStartOffset(numOfDims); + forStartOffset[axis] = 0; + for (size_t offsetIdx = 0, countersIdx = 0; offsetIdx < numOfDims; ++offsetIdx) { + if (offsetIdx == axis) { + continue; + } + forStartOffset[offsetIdx] = counters[countersIdx++]; + } + + size_t startOffset = getStartOffset(forStartOffset, strides); + + const dataType *inputStart = input + startOffset; + dataType *outputStart = output + startOffset; + + size_t offset = strides[axis]; + if (reverse) { + if (exclusive) { + outputStart[offset*(shape[axis] - 1)] = 0; + for (int64_t i = shape[axis] - 2; i >= 0; i--) { + outputStart[i*offset] = inputStart[(i+1)*offset] + outputStart[(i+1)*offset]; + } + } else { + outputStart[offset*(shape[axis] - 1)] = inputStart[offset * (shape[axis] - 1)]; + for (int64_t i = shape[axis] - 2; i >= 0; i--) { + outputStart[i*offset] = inputStart[i*offset] + outputStart[(i+1)*offset]; + } + } + } else { + if (exclusive) { + outputStart[0] = 0; + for (size_t i = 1; i < shape[axis]; i++) { + outputStart[i*offset] = inputStart[(i-1)*offset] + outputStart[(i-1)*offset]; + } + } else { + outputStart[0] = inputStart[0]; + for (size_t i = 1; i < shape[axis]; i++) { + outputStart[i*offset] = inputStart[i*offset] + outputStart[(i-1)*offset]; + } + } + } + + parallelItStep(counters, iterationRange); + } + }); +} + +void MKLDNNCumSumNode::parallelItInit(size_t start, std::vector& counters, const std::vector& iterationRange) { + auto itCounter = counters.rbegin(); + auto itWork = iterationRange.rbegin(); + while (itCounter != counters.rend() && itWork != iterationRange.rend()) { + *itCounter = start % *itWork; + start /= *itWork; + ++itCounter; + ++itWork; + } +} + +inline void MKLDNNCumSumNode::parallelItStep(std::vector& counters, const std::vector& iterationRange) { + auto itCounter = counters.rbegin(); + auto itWork = iterationRange.rbegin(); + + while (itCounter != counters.rend() && itWork != iterationRange.rend()) { + *itCounter = (*itCounter + 1) % *itWork; + if (*itCounter != 0) { + break; + } + ++itCounter; + ++itWork; + } +} + +inline size_t MKLDNNCumSumNode::getStartOffset(const std::vector &forStartOffset, const std::vector& strides) const { + size_t startOffset = 0; + for (size_t idx = 0; idx < forStartOffset.size(); ++idx) { + startOffset += forStartOffset[idx] * strides[idx]; + } + return startOffset; +} + +size_t MKLDNNCumSumNode::getAxis(const Blob::CPtr& _axis, const Blob::CPtr& _data) const { + const auto& 
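The cumSum<>() template above accumulates along the chosen axis through a stride, honouring the exclusive and reverse attributes. A 1-D sketch of the four variants over a contiguous slice (the stride is folded out for clarity; the function name is illustrative):

    #include <cstddef>
    #include <vector>

    // Cumulative sum of one contiguous slice. `exclusive` shifts the result by one
    // (first output is 0); `reverse` accumulates from the tail instead of the head.
    std::vector<float> cum_sum_1d(const std::vector<float>& x, bool exclusive, bool reverse) {
        const size_t n = x.size();
        std::vector<float> y(n, 0.f);
        if (n == 0)
            return y;
        if (!reverse) {
            y[0] = exclusive ? 0.f : x[0];
            for (size_t i = 1; i < n; ++i)
                y[i] = y[i - 1] + (exclusive ? x[i - 1] : x[i]);
        } else {
            y[n - 1] = exclusive ? 0.f : x[n - 1];
            for (size_t i = n - 1; i-- > 0;)
                y[i] = y[i + 1] + (exclusive ? x[i + 1] : x[i]);
        }
        return y;
    }
    // Example: {1, 2, 3} -> inclusive {1, 3, 6}, exclusive {0, 1, 3}, reverse inclusive {6, 5, 3}.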
axisPrecision = _axis->getTensorDesc().getPrecision(); + const int64_t dataShapeSize = static_cast(_data->getTensorDesc().getDims().size()); + int64_t axisValueFromBlob; + switch (axisPrecision) { + case Precision::I32 : { + const auto *axisPtr = _axis->cbuffer().as(); + axisValueFromBlob = static_cast(axisPtr[0]); + break; + } + case Precision::I64 : { + const auto *axisPtr = _axis->cbuffer().as(); + axisValueFromBlob = axisPtr[0]; + break; + } + default : { + IE_THROW() << errorPrefix << " doesn't support 'axis' input with precision: " << axisPrecision.name(); + } + } + if (axisValueFromBlob < -dataShapeSize || axisValueFromBlob > dataShapeSize - 1) + IE_THROW() << errorPrefix << " has axis with a value out of range: " << axisValueFromBlob; + return axisValueFromBlob >= 0 ? axisValueFromBlob : (axisValueFromBlob + dataShapeSize); +} + +bool MKLDNNCumSumNode::created() const { + return getType() == CumSum; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNCumSumNode, CumSum) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_cum_sum_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_cum_sum_node.h new file mode 100644 index 00000000000000..794d6bc73f1722 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_cum_sum_node.h @@ -0,0 +1,50 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNCumSumNode : public MKLDNNNode { +public: + MKLDNNCumSumNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + template + void exec(); + + template + void cumSum(const dataType *input, dataType *output, const std::vector &strides); + + void parallelItInit(size_t start, std::vector& counters, const std::vector& iterationRange); + + inline void parallelItStep(std::vector& counters, const std::vector& iterationRange); + + inline size_t getStartOffset(const std::vector &forStartOffset, const std::vector& strides) const; + + size_t getAxis(const InferenceEngine::Blob::CPtr& _axis, const InferenceEngine::Blob::CPtr& _data) const; + + enum { CUM_SUM_DATA, AXIS, numOfInputs }; + bool exclusive; + bool reverse; + size_t numOfDims; + size_t axis = 0; + std::vector shape; + + InferenceEngine::Precision dataPrecision; + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_def_conv_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_def_conv_node.cpp index dde4d960c5897e..a2fae182a52f70 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_def_conv_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_def_conv_node.cpp @@ -741,7 +741,7 @@ struct jit_uni_def_conv_kernel_f32 : public jit_uni_def_conv_kernel, public jit_ bool MKLDNNDeformableConvolutionNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - auto defConvNode = ngraph::as_type_ptr(op); + const auto defConvNode = ngraph::as_type_ptr(op); if (!defConvNode) { errorMessage = "Node is not an instance of DeformableConvolution form the operation set v1."; return false; diff --git 
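getAxis() above reads the axis from an I32 or I64 blob, accepts values in [-rank, rank - 1], and wraps negatives. The same normalization in isolation:

    #include <cstdint>
    #include <stdexcept>

    // Normalize a possibly negative axis against a tensor rank; throw on out-of-range.
    size_t normalize_axis(int64_t axis, int64_t rank) {
        if (axis < -rank || axis > rank - 1)
            throw std::out_of_range("axis value out of range");
        return static_cast<size_t>(axis >= 0 ? axis : axis + rank);
    }
    // Example: normalize_axis(-1, 4) == 3.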
a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depth_to_space_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depth_to_space_node.cpp index 2f97bbd2f85bda..38bebcd5271072 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depth_to_space_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_depth_to_space_node.cpp @@ -58,6 +58,8 @@ MKLDNNDepthToSpaceNode::MKLDNNDepthToSpaceNode(const std::shared_ptr(std::pow(blockSize, nSpatialDims)); } else { IE_THROW(NotImplemented) << errorMessage; } @@ -74,14 +76,13 @@ void MKLDNNDepthToSpaceNode::getSupportedDescriptors() { if (srcDims.size() != dstDims.size()) THROW_ERROR << "has incorrect number of input/output dimensions"; - size_t nSpatialDims = srcDims.size() - 2; - blockStep = static_cast(std::pow(blockSize, nSpatialDims)); if (srcDims[1] % blockStep) THROW_ERROR << "has block_size parameter which is incompatible with input tensor channels dimension size"; if (srcDims[1] / blockStep != dstDims[1]) THROW_ERROR << "has incompatible input/output channels"; + size_t nSpatialDims = srcDims.size() - 2; for (size_t i = 0; i < nSpatialDims; ++i) { if (srcDims[i + 2] * blockSize != dstDims[i + 2]) THROW_ERROR << "has incompatible spatial dims"; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_detection_output_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_detection_output_node.cpp new file mode 100644 index 00000000000000..4b8c695a987315 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_detection_output_node.cpp @@ -0,0 +1,601 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "base.hpp" + +#include +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_detection_output_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +template +static bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} + +bool MKLDNNDetectionOutputNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto doOp = ngraph::as_type_ptr(op); + if (!doOp) { + errorMessage = "Node is not an instance of the DetectionOutput from the operations set v0."; + return false; + } + if (!details::CaselessEq()(doOp->get_attrs().code_type, "caffe.PriorBoxParameter.CENTER_SIZE") && + !details::CaselessEq()(doOp->get_attrs().code_type, "caffe.PriorBoxParameter.CORNER")) { + errorMessage = "Unsupported code_type attribute: " + doOp->get_attrs().code_type; + return false; + } + } catch (...) 
{ + return false; + } + return true; +} + +MKLDNNDetectionOutputNode::MKLDNNDetectionOutputNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "DetectionOutput layer with name '" + op->get_friendly_name() + "' "; + + if (getOriginalInputsNumber() != 3 && getOriginalInputsNumber() != 5) + IE_THROW() << errorPrefix << " has incorrect number of input edges."; + + if (getOriginalOutputsNumber() != 1) + IE_THROW() << errorPrefix << " has incorrect number of output edges."; + + auto doOp = ngraph::as_type_ptr(op); + auto attributes = doOp->get_attrs(); + + _num_classes = attributes.num_classes; + _background_label_id = attributes.background_label_id; + _top_k = attributes.top_k; + _variance_encoded_in_target = attributes.variance_encoded_in_target; + _keep_top_k = attributes.keep_top_k[0]; + _nms_threshold = attributes.nms_threshold; + _confidence_threshold = attributes.confidence_threshold; + _share_location = attributes.share_location; + _clip_before_nms = attributes.clip_before_nms; + _clip_after_nms = attributes.clip_after_nms; + _decrease_label_id = attributes.decrease_label_id; + _normalized = attributes.normalized; + _image_height = attributes.input_height; + _image_width = attributes.input_width; + _prior_size = _normalized ? 4 : 5; + _offset = _normalized ? 0 : 1; + _num_loc_classes = _share_location ? 1 : _num_classes; + + with_add_box_pred = getOriginalInputsNumber() == 5; + _objectness_score = attributes.objectness_score; + + _code_type = (details::CaselessEq()(attributes.code_type, "caffe.PriorBoxParameter.CENTER_SIZE") ? + CodeType::CENTER_SIZE : CodeType::CORNER); + + _num_priors = static_cast(op->get_input_shape(idx_priors).back() / _prior_size); + _priors_batches = op->get_input_shape(idx_priors).front() != 1; + + if (_num_priors * _num_loc_classes * 4 != static_cast(op->get_input_shape(idx_location)[1])) + IE_THROW() << errorPrefix << " has incorrect number of priors must match number of location predictions (" + << _num_priors * _num_loc_classes * 4 << " vs " + << op->get_input_shape(idx_location)[1] << ")"; + + if (_num_priors * _num_classes != static_cast(op->get_input_shape(idx_confidence).back())) + IE_THROW() << " has incorrect number of priors must match number of confidence predictions."; + + if (_decrease_label_id && _background_label_id != 0) + IE_THROW() << errorPrefix << " cannot use decrease_label_id and background_label_id parameter simultaneously."; + + _num = static_cast(op->get_input_shape(idx_confidence)[0]); + + _decoded_bboxes.resize(_num * _num_classes * _num_priors * 4); + _buffer.resize(_num * _num_classes * _num_priors); + _indices.resize(_num * _num_classes * _num_priors); + _detections_count.resize(_num * _num_classes); + _bbox_sizes.resize(_num * _num_classes * _num_priors); + _num_priors_actual.resize(_num); + + const auto &confSize = op->get_input_shape(idx_confidence); + _reordered_conf.resize(std::accumulate(confSize.begin(), confSize.end(), 1, std::multiplies())); +} + +void MKLDNNDetectionOutputNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + std::vector inDataConf; + inDataConf.reserve(getOriginalInputsNumber()); + for (int i = 0; i < getOriginalInputsNumber(); ++i) + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::FP32); + + addSupportedPrimDesc(inDataConf, + 
{{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); +} + +void MKLDNNDetectionOutputNode::execute(mkldnn::stream strm) { + float *dst_data = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + + const float *loc_data = reinterpret_cast(getParentEdgeAt(idx_location)->getMemoryPtr()->GetPtr()); + const float *conf_data = reinterpret_cast(getParentEdgeAt(idx_confidence)->getMemoryPtr()->GetPtr()); + const float *prior_data = reinterpret_cast(getParentEdgeAt(idx_priors)->getMemoryPtr()->GetPtr()); + const float *arm_conf_data = inDims.size() > 3 ? + reinterpret_cast(getParentEdgeAt(idx_arm_confidence)->getMemoryPtr()->GetPtr()) : nullptr; + const float *arm_loc_data = inDims.size() > 4 ? + reinterpret_cast(getParentEdgeAt(idx_arm_location)->getMemoryPtr()->GetPtr()) : nullptr; + + const int N = getParentEdgeAt(idx_confidence)->getDims()[0]; + + float *decoded_bboxes_data = _decoded_bboxes.data(); + float *reordered_conf_data = _reordered_conf.data(); + float *bbox_sizes_data = _bbox_sizes.data(); + int *detections_data = _detections_count.data(); + int *buffer_data = _buffer.data(); + int *indices_data = _indices.data(); + int *num_priors_actual = _num_priors_actual.data(); + + for (int n = 0; n < N; ++n) { + const float *ppriors = prior_data; + const float *prior_variances = prior_data + _num_priors*_prior_size; + if (_priors_batches) { + ppriors += _variance_encoded_in_target ? n*_num_priors*_prior_size : 2*n*_num_priors*_prior_size; + prior_variances += _variance_encoded_in_target ? 0 : 2*n*_num_priors*_prior_size; + } + + if (_share_location) { + const float *ploc = loc_data + n*4*_num_priors; + float *pboxes = decoded_bboxes_data + n*4*_num_priors; + float *psizes = bbox_sizes_data + n*_num_priors; + + if (with_add_box_pred) { + const float *p_arm_loc = arm_loc_data + n*4*_num_priors; + decodeBBoxes(ppriors, p_arm_loc, prior_variances, pboxes, psizes, num_priors_actual, n, _offset, _prior_size); + decodeBBoxes(pboxes, ploc, prior_variances, pboxes, psizes, num_priors_actual, n, 0, 4, false); + } else { + decodeBBoxes(ppriors, ploc, prior_variances, pboxes, psizes, num_priors_actual, n, _offset, _prior_size); + } + } else { + for (int c = 0; c < _num_loc_classes; ++c) { + if (c == _background_label_id) { + continue; + } + const float *ploc = loc_data + n*4*_num_loc_classes*_num_priors + c*4; + float *pboxes = decoded_bboxes_data + n*4*_num_loc_classes*_num_priors + c*4*_num_priors; + float *psizes = bbox_sizes_data + n*_num_loc_classes*_num_priors + c*_num_priors; + if (with_add_box_pred) { + const float *p_arm_loc = arm_loc_data + n*4*_num_loc_classes*_num_priors + c*4; + decodeBBoxes(ppriors, p_arm_loc, prior_variances, pboxes, psizes, num_priors_actual, n, _offset, _prior_size); + decodeBBoxes(pboxes, ploc, prior_variances, pboxes, psizes, num_priors_actual, n, 0, 4, false); + } else { + decodeBBoxes(ppriors, ploc, prior_variances, pboxes, psizes, num_priors_actual, n, _offset, _prior_size); + } + } + } + } + + if (with_add_box_pred) { + for (int n = 0; n < N; ++n) { + for (int p = 0; p < _num_priors; ++p) { + if (arm_conf_data[n*_num_priors*2 + p * 2 + 1] < _objectness_score) { + for (int c = 0; c < _num_classes; ++c) { + reordered_conf_data[n*_num_priors*_num_classes + c*_num_priors + p] = c == _background_label_id ? 
1.0f : 0.0f; + } + } else { + for (int c = 0; c < _num_classes; ++c) { + reordered_conf_data[n*_num_priors*_num_classes + c*_num_priors + p] = conf_data[n*_num_priors*_num_classes + p*_num_classes + c]; + } + } + } + } + } else { + for (int n = 0; n < N; ++n) { + for (int c = 0; c < _num_classes; ++c) { + for (int p = 0; p < _num_priors; ++p) { + reordered_conf_data[n*_num_priors*_num_classes + c*_num_priors + p] = conf_data[n*_num_priors*_num_classes + p*_num_classes + c]; + } + } + } + } + + memset(detections_data, 0, N*_num_classes*sizeof(int)); + + for (int n = 0; n < N; ++n) { + int detections_total = 0; + + if (!_decrease_label_id) { + // Caffe style + parallel_for(_num_classes, [&](int c) { + if (c != _background_label_id) { // Ignore background class + int *pindices = indices_data + n*_num_classes*_num_priors + c*_num_priors; + int *pbuffer = buffer_data + c*_num_priors; + int *pdetections = detections_data + n*_num_classes + c; + + const float *pconf = reordered_conf_data + n*_num_classes*_num_priors + c*_num_priors; + const float *pboxes; + const float *psizes; + if (_share_location) { + pboxes = decoded_bboxes_data + n*4*_num_priors; + psizes = bbox_sizes_data + n*_num_priors; + } else { + pboxes = decoded_bboxes_data + n*4*_num_classes*_num_priors + c*4*_num_priors; + psizes = bbox_sizes_data + n*_num_classes*_num_priors + c*_num_priors; + } + + nms_cf(pconf, pboxes, psizes, pbuffer, pindices, *pdetections, num_priors_actual[n]); + } + }); + } else { + // MXNet style + int *pindices = indices_data + n*_num_classes*_num_priors; + int *pbuffer = buffer_data; + int *pdetections = detections_data + n*_num_classes; + + const float *pconf = reordered_conf_data + n*_num_classes*_num_priors; + const float *pboxes = decoded_bboxes_data + n*4*_num_loc_classes*_num_priors; + const float *psizes = bbox_sizes_data + n*_num_loc_classes*_num_priors; + + nms_mx(pconf, pboxes, psizes, pbuffer, pindices, pdetections, _num_priors); + } + + for (int c = 0; c < _num_classes; ++c) { + detections_total += detections_data[n*_num_classes + c]; + } + + if (_keep_top_k > -1 && detections_total > _keep_top_k) { + std::vector>> conf_index_class_map; + + for (int c = 0; c < _num_classes; ++c) { + int detections = detections_data[n*_num_classes + c]; + int *pindices = indices_data + n*_num_classes*_num_priors + c*_num_priors; + + float *pconf = reordered_conf_data + n*_num_classes*_num_priors + c*_num_priors; + + for (int i = 0; i < detections; ++i) { + int idx = pindices[i]; + conf_index_class_map.push_back(std::make_pair(pconf[idx], std::make_pair(c, idx))); + } + } + + std::sort(conf_index_class_map.begin(), conf_index_class_map.end(), + SortScorePairDescend>); + conf_index_class_map.resize(_keep_top_k); + + // Store the new indices. 
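At this point the Caffe-style path has collected every surviving (confidence, (class, prior)) triple into `conf_index_class_map`, sorted it by descending confidence and truncated it to `keep_top_k`; the code that follows resets the per-class counters and redistributes the kept indices. The standalone sketch below shows the same global top-K trimming step in isolation; the `Detection` and `keep_top_k` names are illustrative, and where the node uses a full `std::sort` followed by `resize`, the sketch uses the cheaper `std::partial_sort`:

#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

// (confidence, (class id, prior index)); highest confidence first after trimming.
using Detection = std::pair<float, std::pair<int, int>>;

std::vector<Detection> keep_top_k(std::vector<Detection> dets, std::size_t k) {
    if (dets.size() <= k)
        return dets;
    // Only the first k positions need to be fully ordered; the rest are dropped.
    std::partial_sort(dets.begin(), dets.begin() + k, dets.end(),
                      [](const Detection& a, const Detection& b) {
                          return a.first > b.first;
                      });
    dets.resize(k);
    return dets;
}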
+ memset(detections_data + n*_num_classes, 0, _num_classes * sizeof(int)); + + for (size_t j = 0; j < conf_index_class_map.size(); ++j) { + int label = conf_index_class_map[j].second.first; + int idx = conf_index_class_map[j].second.second; + int *pindices = indices_data + n * _num_classes * _num_priors + label * _num_priors; + pindices[detections_data[n*_num_classes + label]] = idx; + detections_data[n*_num_classes + label]++; + } + } + } + + const int num_results = getChildEdgesAtPort(0)[0]->getDims()[2]; + const int DETECTION_SIZE = getChildEdgesAtPort(0)[0]->getDims()[3]; + if (DETECTION_SIZE != 7) { + IE_THROW() << NOT_IMPLEMENTED; + } + + int dst_data_size = 0; + if (_keep_top_k > 0) + dst_data_size = N * _keep_top_k * DETECTION_SIZE * sizeof(float); + else if (_top_k > 0) + dst_data_size = N * _top_k * _num_classes * DETECTION_SIZE * sizeof(float); + else + dst_data_size = N * _num_classes * _num_priors * DETECTION_SIZE * sizeof(float); + + if (dst_data_size > getChildEdgesAtPort(0)[0]->getBlob()->byteSize()) { + IE_THROW() << OUT_OF_BOUNDS; + } + memset(dst_data, 0, dst_data_size); + + int count = 0; + for (int n = 0; n < N; ++n) { + const float *pconf = reordered_conf_data + n * _num_priors * _num_classes; + const float *pboxes = decoded_bboxes_data + n*_num_priors*4*_num_loc_classes; + const int *pindices = indices_data + n*_num_classes*_num_priors; + + for (int c = 0; c < _num_classes; ++c) { + for (int i = 0; i < detections_data[n*_num_classes + c]; ++i) { + int idx = pindices[c*_num_priors + i]; + + dst_data[count * DETECTION_SIZE + 0] = static_cast(n); + dst_data[count * DETECTION_SIZE + 1] = static_cast(_decrease_label_id ? c-1 : c); + dst_data[count * DETECTION_SIZE + 2] = pconf[c*_num_priors + idx]; + + float xmin = _share_location ? pboxes[idx*4 + 0] : + pboxes[c*4*_num_priors + idx*4 + 0]; + float ymin = _share_location ? pboxes[idx*4 + 1] : + pboxes[c*4*_num_priors + idx*4 + 1]; + float xmax = _share_location ? pboxes[idx*4 + 2] : + pboxes[c*4*_num_priors + idx*4 + 2]; + float ymax = _share_location ? 
pboxes[idx*4 + 3] : + pboxes[c*4*_num_priors + idx*4 + 3]; + + if (_clip_after_nms) { + xmin = (std::max)(0.0f, (std::min)(1.0f, xmin)); + ymin = (std::max)(0.0f, (std::min)(1.0f, ymin)); + xmax = (std::max)(0.0f, (std::min)(1.0f, xmax)); + ymax = (std::max)(0.0f, (std::min)(1.0f, ymax)); + } + + dst_data[count * DETECTION_SIZE + 3] = xmin; + dst_data[count * DETECTION_SIZE + 4] = ymin; + dst_data[count * DETECTION_SIZE + 5] = xmax; + dst_data[count * DETECTION_SIZE + 6] = ymax; + + ++count; + } + } + } + + if (count < num_results) { + // marker at end of boxes list + dst_data[count * DETECTION_SIZE + 0] = -1; + } +} + +struct ConfidenceComparator { + explicit ConfidenceComparator(const float* conf_data) : _conf_data(conf_data) {} + + bool operator()(int idx1, int idx2) { + if (_conf_data[idx1] > _conf_data[idx2]) return true; + if (_conf_data[idx1] < _conf_data[idx2]) return false; + return idx1 < idx2; + } + + const float* _conf_data; +}; + +static inline float JaccardOverlap(const float *decoded_bbox, + const float *bbox_sizes, + const int idx1, + const int idx2) { + float xmin1 = decoded_bbox[idx1*4 + 0]; + float ymin1 = decoded_bbox[idx1*4 + 1]; + float xmax1 = decoded_bbox[idx1*4 + 2]; + float ymax1 = decoded_bbox[idx1*4 + 3]; + + float xmin2 = decoded_bbox[idx2*4 + 0]; + float ymin2 = decoded_bbox[idx2*4 + 1]; + float xmax2 = decoded_bbox[idx2*4 + 2]; + float ymax2 = decoded_bbox[idx2*4 + 3]; + + if (xmin2 > xmax1 || xmax2 < xmin1 || ymin2 > ymax1 || ymax2 < ymin1) { + return 0.0f; + } + + float intersect_xmin = (std::max)(xmin1, xmin2); + float intersect_ymin = (std::max)(ymin1, ymin2); + float intersect_xmax = (std::min)(xmax1, xmax2); + float intersect_ymax = (std::min)(ymax1, ymax2); + + float intersect_width = intersect_xmax - intersect_xmin; + float intersect_height = intersect_ymax - intersect_ymin; + + if (intersect_width <= 0 || intersect_height <= 0) { + return 0.0f; + } + + float intersect_size = intersect_width * intersect_height; + float bbox1_size = bbox_sizes[idx1]; + float bbox2_size = bbox_sizes[idx2]; + + return intersect_size / (bbox1_size + bbox2_size - intersect_size); +} + +void MKLDNNDetectionOutputNode::decodeBBoxes(const float *prior_data, + const float *loc_data, + const float *variance_data, + float *decoded_bboxes, + float *decoded_bbox_sizes, + int* num_priors_actual, + int n, + const int& offs, + const int& pr_size, + bool decodeType) { + num_priors_actual[n] = _num_priors; + if (!_normalized && decodeType) { + int num = 0; + for (; num < _num_priors; ++num) { + float batch_id = prior_data[num * pr_size + 0]; + if (batch_id == -1.f) { + num_priors_actual[n] = num; + break; + } + } + } + parallel_for(num_priors_actual[n], [&](int p) { + float new_xmin = 0.0f; + float new_ymin = 0.0f; + float new_xmax = 0.0f; + float new_ymax = 0.0f; + + float prior_xmin = prior_data[p*pr_size + 0 + offs]; + float prior_ymin = prior_data[p*pr_size + 1 + offs]; + float prior_xmax = prior_data[p*pr_size + 2 + offs]; + float prior_ymax = prior_data[p*pr_size + 3 + offs]; + + float loc_xmin = loc_data[4*p*_num_loc_classes + 0]; + float loc_ymin = loc_data[4*p*_num_loc_classes + 1]; + float loc_xmax = loc_data[4*p*_num_loc_classes + 2]; + float loc_ymax = loc_data[4*p*_num_loc_classes + 3]; + + if (!_normalized) { + prior_xmin /= _image_width; + prior_ymin /= _image_height; + prior_xmax /= _image_width; + prior_ymax /= _image_height; + } + + if (_code_type == CodeType::CORNER) { + if (_variance_encoded_in_target) { + // variance is encoded in target, we simply need to add the 
offset predictions. + new_xmin = prior_xmin + loc_xmin; + new_ymin = prior_ymin + loc_ymin; + new_xmax = prior_xmax + loc_xmax; + new_ymax = prior_ymax + loc_ymax; + } else { + new_xmin = prior_xmin + variance_data[p*4 + 0] * loc_xmin; + new_ymin = prior_ymin + variance_data[p*4 + 1] * loc_ymin; + new_xmax = prior_xmax + variance_data[p*4 + 2] * loc_xmax; + new_ymax = prior_ymax + variance_data[p*4 + 3] * loc_ymax; + } + } else if (_code_type == CodeType::CENTER_SIZE) { + float prior_width = prior_xmax - prior_xmin; + float prior_height = prior_ymax - prior_ymin; + float prior_center_x = (prior_xmin + prior_xmax) / 2.0f; + float prior_center_y = (prior_ymin + prior_ymax) / 2.0f; + + float decode_bbox_center_x, decode_bbox_center_y; + float decode_bbox_width, decode_bbox_height; + + if (_variance_encoded_in_target) { + // variance is encoded in target, we simply need to restore the offset predictions. + decode_bbox_center_x = loc_xmin * prior_width + prior_center_x; + decode_bbox_center_y = loc_ymin * prior_height + prior_center_y; + decode_bbox_width = std::exp(loc_xmax) * prior_width; + decode_bbox_height = std::exp(loc_ymax) * prior_height; + } else { + // variance is encoded in bbox, we need to scale the offset accordingly. + decode_bbox_center_x = variance_data[p*4 + 0] * loc_xmin * prior_width + prior_center_x; + decode_bbox_center_y = variance_data[p*4 + 1] * loc_ymin * prior_height + prior_center_y; + decode_bbox_width = std::exp(variance_data[p*4 + 2] * loc_xmax) * prior_width; + decode_bbox_height = std::exp(variance_data[p*4 + 3] * loc_ymax) * prior_height; + } + + new_xmin = decode_bbox_center_x - decode_bbox_width / 2.0f; + new_ymin = decode_bbox_center_y - decode_bbox_height / 2.0f; + new_xmax = decode_bbox_center_x + decode_bbox_width / 2.0f; + new_ymax = decode_bbox_center_y + decode_bbox_height / 2.0f; + } + + if (_clip_before_nms) { + new_xmin = (std::max)(0.0f, (std::min)(1.0f, new_xmin)); + new_ymin = (std::max)(0.0f, (std::min)(1.0f, new_ymin)); + new_xmax = (std::max)(0.0f, (std::min)(1.0f, new_xmax)); + new_ymax = (std::max)(0.0f, (std::min)(1.0f, new_ymax)); + } + + decoded_bboxes[p*4 + 0] = new_xmin; + decoded_bboxes[p*4 + 1] = new_ymin; + decoded_bboxes[p*4 + 2] = new_xmax; + decoded_bboxes[p*4 + 3] = new_ymax; + + decoded_bbox_sizes[p] = (new_xmax - new_xmin) * (new_ymax - new_ymin); + }); +} + +void MKLDNNDetectionOutputNode::nms_cf(const float* conf_data, + const float* bboxes, + const float* sizes, + int* buffer, + int* indices, + int& detections, + int num_priors_actual) { + int count = 0; + for (int i = 0; i < num_priors_actual; ++i) { + if (conf_data[i] > _confidence_threshold) { + indices[count] = i; + count++; + } + } + + int num_output_scores = (_top_k == -1 ? 
count : (std::min)(_top_k, count)); + + std::partial_sort_copy(indices, indices + count, + buffer, buffer + num_output_scores, + ConfidenceComparator(conf_data)); + + for (int i = 0; i < num_output_scores; ++i) { + const int idx = buffer[i]; + + bool keep = true; + for (int k = 0; k < detections; ++k) { + const int kept_idx = indices[k]; + float overlap = JaccardOverlap(bboxes, sizes, idx, kept_idx); + if (overlap > _nms_threshold) { + keep = false; + break; + } + } + if (keep) { + indices[detections] = idx; + detections++; + } + } +} + +void MKLDNNDetectionOutputNode::nms_mx(const float* conf_data, + const float* bboxes, + const float* sizes, + int* buffer, + int* indices, + int* detections, + int num_priors_actual) { + int count = 0; + for (int i = 0; i < num_priors_actual; ++i) { + float conf = -1; + int id = 0; + for (int c = 1; c < _num_classes; ++c) { + float temp = conf_data[c*_num_priors + i]; + if (temp > conf) { + conf = temp; + id = c; + } + } + + if (id > 0 && conf >= _confidence_threshold) { + indices[count++] = id*_num_priors + i; + } + } + + int num_output_scores = (_top_k == -1 ? count : (std::min)(_top_k, count)); + + std::partial_sort_copy(indices, indices + count, + buffer, buffer + num_output_scores, + ConfidenceComparator(conf_data)); + + for (int i = 0; i < num_output_scores; ++i) { + const int idx = buffer[i]; + const int cls = idx/_num_priors; + const int prior = idx%_num_priors; + + int &ndetection = detections[cls]; + int *pindices = indices + cls*_num_priors; + + bool keep = true; + for (int k = 0; k < ndetection; ++k) { + const int kept_idx = pindices[k]; + float overlap = 0.0f; + if (_share_location) { + overlap = JaccardOverlap(bboxes, sizes, prior, kept_idx); + } else { + overlap = JaccardOverlap(bboxes, sizes, cls*_num_priors + prior, cls*_num_priors + kept_idx); + } + if (overlap > _nms_threshold) { + keep = false; + break; + } + } + if (keep) { + pindices[ndetection++] = prior; + } + } +} + +bool MKLDNNDetectionOutputNode::created() const { + return getType() == DetectionOutput; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNDetectionOutputNode, DetectionOutput) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_detection_output_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_detection_output_node.h new file mode 100644 index 00000000000000..dbf9bde760907c --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_detection_output_node.h @@ -0,0 +1,86 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNDetectionOutputNode : public MKLDNNNode { +public: + MKLDNNDetectionOutputNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + const int idx_location = 0; + const int idx_confidence = 1; + const int idx_priors = 2; + const int idx_arm_confidence = 3; + const int idx_arm_location = 4; + + int _num_classes = 0; + int _background_label_id = 0; + int _top_k = 0; + int _variance_encoded_in_target = 0; + int _keep_top_k = 0; + int _code_type = 0; + + bool _share_location = false; + bool _clip_before_nms = false; // clip bounding boxes before nms step + bool 
_clip_after_nms = false; // clip bounding boxes after nms step + bool _decrease_label_id = false; + + bool with_add_box_pred = false; + + int _image_width = 0; + int _image_height = 0; + int _prior_size = 4; + bool _normalized = true; + int _offset = 0; + + float _nms_threshold = 0.0f; + float _confidence_threshold = 0.0f; + float _objectness_score = 0.0f; + + int _num = 0; + int _num_loc_classes = 0; + int _num_priors = 0; + bool _priors_batches = false; + + enum CodeType { + CORNER = 1, + CENTER_SIZE = 2, + }; + + void decodeBBoxes(const float *prior_data, const float *loc_data, const float *variance_data, + float *decoded_bboxes, float *decoded_bbox_sizes, int* num_priors_actual, int n, const int& offs, const int& pr_size, + bool decodeType = true); // after ARM = false + + void nms_cf(const float *conf_data, const float *bboxes, const float *sizes, + int *buffer, int *indices, int &detections, int num_priors_actual); + + void nms_mx(const float *conf_data, const float *bboxes, const float *sizes, + int *buffer, int *indices, int *detections, int num_priors_actual); + + std::vector _decoded_bboxes; + std::vector _buffer; + std::vector _indices; + std::vector _detections_count; + std::vector _reordered_conf; + std::vector _bbox_sizes; + std::vector _num_priors_actual; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_dft_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_dft_node.cpp index 21fb93728c17fb..b9ef511d010fce 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_dft_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_dft_node.cpp @@ -49,7 +49,7 @@ MKLDNNDFTNode::MKLDNNDFTNode(const std::shared_ptr& op, const mkld /* Data */ inputShape = inDims[DATA_INDEX].ToSizeVector(); - if (inputShape.size() < 1) { + if (inputShape.size() < 2) { IE_THROW() << layerErrorPrefix << " has invalid 'data' input tensor with rank: " << inputShape.size(); } diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h index 6b565370917db7..34e95d45ae06e8 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h @@ -124,4 +124,3 @@ class MKLDNNEltwiseNode : public MKLDNNNode { }; } // namespace MKLDNNPlugin - diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_embedding_bag_offset_sum_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_embedding_bag_offset_sum_node.cpp index c8810e4444b2a5..f59b69b023d99c 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_embedding_bag_offset_sum_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_embedding_bag_offset_sum_node.cpp @@ -13,7 +13,7 @@ using namespace InferenceEngine; bool MKLDNNEmbeddingBagOffsetSumNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - auto embBagOffsetSumOp = ngraph::as_type_ptr(op); + const auto embBagOffsetSumOp = ngraph::as_type_ptr(op); if (!embBagOffsetSumOp) { errorMessage = "Node is not an instance of the EmbeddingBagOffsetsSum operation from opset v3."; return false; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_embedding_bag_packed_sum_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_embedding_bag_packed_sum_node.cpp index 4d1b808b502fb5..3318e1089faeed 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_embedding_bag_packed_sum_node.cpp +++ 
b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_embedding_bag_packed_sum_node.cpp @@ -13,7 +13,7 @@ using namespace InferenceEngine; bool MKLDNNEmbeddingBagPackedSumNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - auto embBagPackedSumOp = ngraph::as_type_ptr(op); + const auto embBagPackedSumOp = ngraph::as_type_ptr(op); if (!embBagPackedSumOp) { errorMessage = "Node is not an instance of the EmbeddingBagPackedSum operation from opset v3."; return false; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_embedding_segments_sum_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_embedding_segments_sum_node.cpp index 798feecf7bd062..82eae04dcc2193 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_embedding_segments_sum_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_embedding_segments_sum_node.cpp @@ -13,7 +13,7 @@ using namespace InferenceEngine; bool MKLDNNEmbeddingSegmentsSumNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - auto embBagSegSumOp = ngraph::as_type_ptr(op); + const auto embBagSegSumOp = ngraph::as_type_ptr(op); if (!embBagSegSumOp) { errorMessage = "Node is not an instance of the EmbeddingSegmentsSum operation from opset v3."; return false; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_detection_output_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_detection_output_node.cpp new file mode 100644 index 00000000000000..fe2362003f377a --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_detection_output_node.cpp @@ -0,0 +1,369 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "base.hpp" + +#include +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_experimental_detectron_detection_output_node.h" + + +struct Indexer { + const std::vector dims_; + int total_{1}; + + explicit Indexer(const std::vector& dims) : dims_(dims) { + total_ = 1; + for (size_t i = 0; i < dims_.size(); ++i) { + total_ *= dims_[i]; + } + } + + int operator()(const std::vector& idx) const { + int flat_idx = 0; + assert(idx.size() == dims_.size()); + for (size_t i = 0; i < dims_.size(); ++i) { + assert(0 <= idx[i] && idx[i] < dims_[i]); + flat_idx = flat_idx * dims_[i] + idx[i]; + } + assert(flat_idx < total_); + return flat_idx; + } +}; + + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +static +void refine_boxes(const float* boxes, const float* deltas, const float* weights, const float* scores, + float* refined_boxes, float* refined_boxes_areas, float* refined_scores, + const int rois_num, const int classes_num, + const float img_H, const float img_W, + const float max_delta_log_wh, + float coordinates_offset) { + Indexer box_idx({rois_num, 4}); + Indexer delta_idx({rois_num, classes_num, 4}); + Indexer score_idx({rois_num, classes_num}); + + Indexer refined_box_idx({classes_num, rois_num, 4}); + Indexer refined_score_idx({classes_num, rois_num}); + + for (int roi_idx = 0; roi_idx < rois_num; ++roi_idx) { + float x0 = boxes[box_idx({roi_idx, 0})]; + float y0 = boxes[box_idx({roi_idx, 1})]; + float x1 = boxes[box_idx({roi_idx, 2})]; + float y1 = boxes[box_idx({roi_idx, 3})]; + + if (x1 - x0 <= 0 || y1 - y0 <= 0) { + continue; + } + + // width & height of box + const float ww = x1 - x0 + coordinates_offset; + const float hh = y1 - y0 + coordinates_offset; + // center 
location of box + const float ctr_x = x0 + 0.5f * ww; + const float ctr_y = y0 + 0.5f * hh; + + for (int class_idx = 1; class_idx < classes_num; ++class_idx) { + const float dx = deltas[delta_idx({roi_idx, class_idx, 0})] / weights[0]; + const float dy = deltas[delta_idx({roi_idx, class_idx, 1})] / weights[1]; + const float d_log_w = deltas[delta_idx({roi_idx, class_idx, 2})] / weights[2]; + const float d_log_h = deltas[delta_idx({roi_idx, class_idx, 3})] / weights[3]; + + // new center location according to deltas (dx, dy) + const float pred_ctr_x = dx * ww + ctr_x; + const float pred_ctr_y = dy * hh + ctr_y; + // new width & height according to deltas d(log w), d(log h) + const float pred_w = std::exp((std::min)(d_log_w, max_delta_log_wh)) * ww; + const float pred_h = std::exp((std::min)(d_log_h, max_delta_log_wh)) * hh; + + // update upper-left corner location + float x0_new = pred_ctr_x - 0.5f * pred_w; + float y0_new = pred_ctr_y - 0.5f * pred_h; + // update lower-right corner location + float x1_new = pred_ctr_x + 0.5f * pred_w - coordinates_offset; + float y1_new = pred_ctr_y + 0.5f * pred_h - coordinates_offset; + + // adjust new corner locations to be within the image region, + x0_new = std::max(0.0f, x0_new); + y0_new = std::max(0.0f, y0_new); + x1_new = std::max(0.0f, x1_new); + y1_new = std::max(0.0f, y1_new); + + // recompute new width & height + const float box_w = x1_new - x0_new + coordinates_offset; + const float box_h = y1_new - y0_new + coordinates_offset; + + refined_boxes[refined_box_idx({class_idx, roi_idx, 0})] = x0_new; + refined_boxes[refined_box_idx({class_idx, roi_idx, 1})] = y0_new; + refined_boxes[refined_box_idx({class_idx, roi_idx, 2})] = x1_new; + refined_boxes[refined_box_idx({class_idx, roi_idx, 3})] = y1_new; + + refined_boxes_areas[refined_score_idx({class_idx, roi_idx})] = box_w * box_h; + + refined_scores[refined_score_idx({class_idx, roi_idx})] = scores[score_idx({roi_idx, class_idx})]; + } + } +} + +template +static bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} + + +struct ConfidenceComparator { + explicit ConfidenceComparator(const float* conf_data) : _conf_data(conf_data) {} + + bool operator()(int idx1, int idx2) { + if (_conf_data[idx1] > _conf_data[idx2]) return true; + if (_conf_data[idx1] < _conf_data[idx2]) return false; + return idx1 < idx2; + } + + const float* _conf_data; +}; + +static inline float JaccardOverlap(const float *decoded_bbox, + const float *bbox_sizes, + const int idx1, + const int idx2, + const float coordinates_offset = 1) { + float xmin1 = decoded_bbox[idx1 * 4 + 0]; + float ymin1 = decoded_bbox[idx1 * 4 + 1]; + float xmax1 = decoded_bbox[idx1 * 4 + 2]; + float ymax1 = decoded_bbox[idx1 * 4 + 3]; + + float xmin2 = decoded_bbox[idx2 * 4 + 0]; + float ymin2 = decoded_bbox[idx2 * 4 + 1]; + float ymax2 = decoded_bbox[idx2 * 4 + 3]; + float xmax2 = decoded_bbox[idx2 * 4 + 2]; + + if (xmin2 > xmax1 || xmax2 < xmin1 || ymin2 > ymax1 || ymax2 < ymin1) { + return 0.0f; + } + + float intersect_xmin = (std::max)(xmin1, xmin2); + float intersect_ymin = (std::max)(ymin1, ymin2); + float intersect_xmax = (std::min)(xmax1, xmax2); + float intersect_ymax = (std::min)(ymax1, ymax2); + + float intersect_width = intersect_xmax - intersect_xmin + coordinates_offset; + float intersect_height = intersect_ymax - intersect_ymin + coordinates_offset; + + if (intersect_width <= 0 || intersect_height <= 0) { + return 0.0f; + } + + float intersect_size = intersect_width * 
intersect_height; + float bbox1_size = bbox_sizes[idx1]; + float bbox2_size = bbox_sizes[idx2]; + + return intersect_size / (bbox1_size + bbox2_size - intersect_size); +} + + +static void nms_cf(const float* conf_data, + const float* bboxes, + const float* sizes, + int* buffer, + int* indices, + int& detections, + const int boxes_num, + const int pre_nms_topn, + const int post_nms_topn, + const float confidence_threshold, + const float nms_threshold) { + int count = 0; + for (int i = 0; i < boxes_num; ++i) { + if (conf_data[i] > confidence_threshold) { + indices[count] = i; + count++; + } + } + + int num_output_scores = (pre_nms_topn == -1 ? count : (std::min)(pre_nms_topn, count)); + + std::partial_sort_copy(indices, indices + count, + buffer, buffer + num_output_scores, + ConfidenceComparator(conf_data)); + + detections = 0; + for (int i = 0; i < num_output_scores; ++i) { + const int idx = buffer[i]; + + bool keep = true; + for (int k = 0; k < detections; ++k) { + const int kept_idx = indices[k]; + float overlap = JaccardOverlap(bboxes, sizes, idx, kept_idx); + if (overlap > nms_threshold) { + keep = false; + break; + } + } + if (keep) { + indices[detections] = idx; + detections++; + } + } + + detections = (post_nms_topn == -1 ? detections : (std::min)(post_nms_topn, detections)); +} + +bool MKLDNNExperimentalDetectronDetectionOutputNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto doOp = ngraph::as_type_ptr(op); + if (!doOp) { + errorMessage = "Node is not an instance of the ExperimentalDetectronDetectionOutput from the operations set v6."; + return false; + } + } catch (...) { + return false; + } + return true; +} + +MKLDNNExperimentalDetectronDetectionOutputNode::MKLDNNExperimentalDetectronDetectionOutputNode + (const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + auto doOp = ngraph::as_type_ptr(op); + auto attributes = doOp->get_attrs(); + + score_threshold_ = attributes.score_threshold; + nms_threshold_ = attributes.nms_threshold; + max_delta_log_wh_ = attributes.max_delta_log_wh; + classes_num_ = attributes.num_classes; + max_detections_per_class_ = attributes.post_nms_count; + max_detections_per_image_ = attributes.max_detections_per_image; + class_agnostic_box_regression_ = attributes.class_agnostic_box_regression; + deltas_weights_ = attributes.deltas_weights; +} + +void MKLDNNExperimentalDetectronDetectionOutputNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + std::vector inDataConf; + inDataConf.reserve(getOriginalInputsNumber()); + for (int i = 0; i < getOriginalInputsNumber(); ++i) + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::FP32); + + addSupportedPrimDesc(inDataConf, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::I32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); +} + +void MKLDNNExperimentalDetectronDetectionOutputNode::execute(mkldnn::stream strm) { + const int rois_num = getParentEdgeAt(INPUT_ROIS)->getDims()[0]; + assert(classes_num_ == static_cast(getParentEdgeAt(INPUT_SCORES)->getDims()[1])); + assert(4 * classes_num_ == static_cast(getParentEdgeAt(INPUT_DELTAS)->getDims()[1])); + + const auto* boxes = 
reinterpret_cast(getParentEdgeAt(INPUT_ROIS)->getMemoryPtr()->GetPtr()); + const auto* deltas = reinterpret_cast(getParentEdgeAt(INPUT_DELTAS)->getMemoryPtr()->GetPtr()); + const auto* scores = reinterpret_cast(getParentEdgeAt(INPUT_SCORES)->getMemoryPtr()->GetPtr()); + const auto* im_info = reinterpret_cast(getParentEdgeAt(INPUT_IM_INFO)->getMemoryPtr()->GetPtr()); + + auto* output_boxes = reinterpret_cast(getChildEdgesAtPort(OUTPUT_BOXES)[0]->getMemoryPtr()->GetPtr()); + auto* output_scores = reinterpret_cast(getChildEdgesAtPort(OUTPUT_SCORES)[0]->getMemoryPtr()->GetPtr()); + auto* output_classes = reinterpret_cast(getChildEdgesAtPort(OUTPUT_CLASSES)[0]->getMemoryPtr()->GetPtr()); + + const float img_H = im_info[0]; + const float img_W = im_info[1]; + + // Apply deltas. + std::vector refined_boxes(classes_num_ * rois_num * 4, 0); + std::vector refined_scores(classes_num_ * rois_num, 0); + std::vector refined_boxes_areas(classes_num_ * rois_num, 0); + Indexer refined_box_idx({classes_num_, rois_num, 4}); + Indexer refined_score_idx({classes_num_, rois_num}); + + refine_boxes(boxes, deltas, &deltas_weights_[0], scores, + &refined_boxes[0], &refined_boxes_areas[0], &refined_scores[0], + rois_num, classes_num_, + img_H, img_W, + max_delta_log_wh_, + 1.0f); + + // Apply NMS class-wise. + std::vector buffer(rois_num, 0); + std::vector indices(classes_num_ * rois_num, 0); + std::vector detections_per_class(classes_num_, 0); + int total_detections_num = 0; + + for (int class_idx = 1; class_idx < classes_num_; ++class_idx) { + nms_cf(&refined_scores[refined_score_idx({class_idx, 0})], + &refined_boxes[refined_box_idx({class_idx, 0, 0})], + &refined_boxes_areas[refined_score_idx({class_idx, 0})], + &buffer[0], + &indices[total_detections_num], + detections_per_class[class_idx], + rois_num, + -1, + max_detections_per_class_, + score_threshold_, + nms_threshold_); + total_detections_num += detections_per_class[class_idx]; + } + + // Leave only max_detections_per_image_ detections. + // confidence, + std::vector>> conf_index_class_map; + + int indices_offset = 0; + for (int c = 0; c < classes_num_; ++c) { + int n = detections_per_class[c]; + for (int i = 0; i < n; ++i) { + int idx = indices[indices_offset + i]; + float score = refined_scores[refined_score_idx({c, idx})]; + conf_index_class_map.push_back(std::make_pair(score, std::make_pair(c, idx))); + } + indices_offset += n; + } + + assert(max_detections_per_image_ > 0); + if (total_detections_num > max_detections_per_image_) { + std::partial_sort(conf_index_class_map.begin(), + conf_index_class_map.begin() + max_detections_per_image_, + conf_index_class_map.end(), + SortScorePairDescend>); + conf_index_class_map.resize(max_detections_per_image_); + total_detections_num = max_detections_per_image_; + } + + // Fill outputs. 
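Both `nms_cf` in this node and the DetectionOutput variant earlier in the patch apply the same greedy scheme: sort candidates by confidence, then accept a box only if its IoU with every already accepted box stays at or below the NMS threshold. A compact, self-contained sketch of that scheme follows; `Box`, `iou` and `greedy_nms` are illustrative names, and unlike `JaccardOverlap` above the sketch does not add a `coordinates_offset` to the box widths and heights:

#include <algorithm>
#include <vector>

struct Box { float x0, y0, x1, y1, score; };

// Intersection-over-union of two corner-format boxes.
static float iou(const Box& a, const Box& b) {
    const float ix0 = std::max(a.x0, b.x0), iy0 = std::max(a.y0, b.y0);
    const float ix1 = std::min(a.x1, b.x1), iy1 = std::min(a.y1, b.y1);
    const float iw = std::max(0.0f, ix1 - ix0), ih = std::max(0.0f, iy1 - iy0);
    const float inter = iw * ih;
    const float areaA = (a.x1 - a.x0) * (a.y1 - a.y0);
    const float areaB = (b.x1 - b.x0) * (b.y1 - b.y0);
    const float uni = areaA + areaB - inter;
    return uni > 0.0f ? inter / uni : 0.0f;
}

// Greedy NMS: `sorted` must already be ordered by descending score.
static std::vector<Box> greedy_nms(const std::vector<Box>& sorted, float thr) {
    std::vector<Box> kept;
    for (const Box& c : sorted) {
        bool keep = true;
        for (const Box& k : kept)
            if (iou(c, k) > thr) { keep = false; break; }
        if (keep)
            kept.push_back(c);
    }
    return kept;
}

Running such a pass per class and then trimming the union of survivors to `max_detections_per_image_` mirrors the two-stage filtering performed just above.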
+ memset(output_boxes, 0, max_detections_per_image_ * 4 * sizeof(output_boxes[0])); + memset(output_scores, 0, max_detections_per_image_ * sizeof(output_scores[0])); + memset(output_classes, 0, max_detections_per_image_ * sizeof(output_classes[0])); + + int i = 0; + for (const auto & detection : conf_index_class_map) { + float score = detection.first; + int cls = detection.second.first; + int idx = detection.second.second; + output_boxes[4 * i + 0] = refined_boxes[refined_box_idx({cls, idx, 0})]; + output_boxes[4 * i + 1] = refined_boxes[refined_box_idx({cls, idx, 1})]; + output_boxes[4 * i + 2] = refined_boxes[refined_box_idx({cls, idx, 2})]; + output_boxes[4 * i + 3] = refined_boxes[refined_box_idx({cls, idx, 3})]; + output_scores[i] = score; + output_classes[i] = cls; + ++i; + } +} + +bool MKLDNNExperimentalDetectronDetectionOutputNode::created() const { + return getType() == ExperimentalDetectronDetectionOutput; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNExperimentalDetectronDetectionOutputNode, ExperimentalDetectronDetectionOutput) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_detection_output_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_detection_output_node.h new file mode 100644 index 00000000000000..2df28ce5c4983b --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_detection_output_node.h @@ -0,0 +1,46 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNExperimentalDetectronDetectionOutputNode : public MKLDNNNode { +public: + MKLDNNExperimentalDetectronDetectionOutputNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + const int INPUT_ROIS {0}; + const int INPUT_DELTAS {1}; + const int INPUT_SCORES {2}; + const int INPUT_IM_INFO {3}; + + const int OUTPUT_BOXES {0}; + const int OUTPUT_CLASSES {1}; + const int OUTPUT_SCORES {2}; + + float score_threshold_; + float nms_threshold_; + float max_delta_log_wh_; + int classes_num_; + int max_detections_per_class_; + int max_detections_per_image_; + bool class_agnostic_box_regression_; + std::vector deltas_weights_; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_generate_proposals_single_image_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_generate_proposals_single_image_node.cpp new file mode 100644 index 00000000000000..255f8443765660 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_generate_proposals_single_image_node.cpp @@ -0,0 +1,429 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "base.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#if defined(HAVE_AVX2) +#include +#endif + +#include +#include "ie_parallel.hpp" +#include "common/cpu_memcpy.h" +#include "mkldnn_experimental_detectron_generate_proposals_single_image_node.h" + +namespace { +struct Indexer4d { + int dim3_; + int dim23_; + 
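+ // dim3_, dim23_ and dim123_ cache the row-major strides of a
+ // [dim0, dim1, dim2, dim3] tensor: operator() below maps the coordinates
+ // (i, j, k, n) to the flat offset i*dim123_ + j*dim23_ + k*dim3_ + n.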
int dim123_; + + explicit Indexer4d(int dim0, int dim1, int dim2, int dim3): + dim3_(dim3), dim23_(dim2 * dim3), dim123_(dim1 * dim2 * dim3) { + (void)dim0; + } + + int operator()(int i, int j, int k, int n) const { + return i * dim123_ + j * dim23_ + k * dim3_ + n; + } +}; +} // namespace + + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +static +void refine_anchors(const float* deltas, const float* scores, const float* anchors, + float* proposals, const int anchors_num, const int bottom_H, + const int bottom_W, const float img_H, const float img_W, + const float min_box_H, const float min_box_W, + const float max_delta_log_wh, + float coordinates_offset) { + Indexer4d delta_idx(anchors_num, 4, bottom_H, bottom_W); + Indexer4d score_idx(anchors_num, 1, bottom_H, bottom_W); + Indexer4d proposal_idx(bottom_H, bottom_W, anchors_num, 5); + Indexer4d anchor_idx(bottom_H, bottom_W, anchors_num, 4); + + parallel_for2d(bottom_H, bottom_W, [&](int h, int w) { + for (int anchor = 0; anchor < anchors_num; ++anchor) { + int a_idx = anchor_idx(h, w, anchor, 0); + float x0 = anchors[a_idx + 0]; + float y0 = anchors[a_idx + 1]; + float x1 = anchors[a_idx + 2]; + float y1 = anchors[a_idx + 3]; + + const float dx = deltas[delta_idx(anchor, 0, h, w)]; + const float dy = deltas[delta_idx(anchor, 1, h, w)]; + const float d_log_w = deltas[delta_idx(anchor, 2, h, w)]; + const float d_log_h = deltas[delta_idx(anchor, 3, h, w)]; + + const float score = scores[score_idx(anchor, 0, h, w)]; + + // width & height of box + const float ww = x1 - x0 + coordinates_offset; + const float hh = y1 - y0 + coordinates_offset; + // center location of box + const float ctr_x = x0 + 0.5f * ww; + const float ctr_y = y0 + 0.5f * hh; + + // new center location according to deltas (dx, dy) + const float pred_ctr_x = dx * ww + ctr_x; + const float pred_ctr_y = dy * hh + ctr_y; + // new width & height according to deltas d(log w), d(log h) + const float pred_w = std::exp(std::min(d_log_w, max_delta_log_wh)) * ww; + const float pred_h = std::exp(std::min(d_log_h, max_delta_log_wh)) * hh; + + // update upper-left corner location + x0 = pred_ctr_x - 0.5f * pred_w; + y0 = pred_ctr_y - 0.5f * pred_h; + // update lower-right corner location + x1 = pred_ctr_x + 0.5f * pred_w - coordinates_offset; + y1 = pred_ctr_y + 0.5f * pred_h - coordinates_offset; + + // adjust new corner locations to be within the image region, + x0 = std::max(0.0f, std::min(x0, img_W - coordinates_offset)); + y0 = std::max(0.0f, std::min(y0, img_H - coordinates_offset)); + x1 = std::max(0.0f, std::min(x1, img_W - coordinates_offset)); + y1 = std::max(0.0f, std::min(y1, img_H - coordinates_offset)); + + // recompute new width & height + const float box_w = x1 - x0 + coordinates_offset; + const float box_h = y1 - y0 + coordinates_offset; + + int p_idx = proposal_idx(h, w, anchor, 0); + proposals[p_idx + 0] = x0; + proposals[p_idx + 1] = y0; + proposals[p_idx + 2] = x1; + proposals[p_idx + 3] = y1; + proposals[p_idx + 4] = (min_box_W <= box_w) * (min_box_H <= box_h) * score; + } + }); +} + +static void unpack_boxes(const float* p_proposals, float* unpacked_boxes, int pre_nms_topn) { + parallel_for(pre_nms_topn, [&](size_t i) { + unpacked_boxes[0*pre_nms_topn + i] = p_proposals[5*i + 0]; + unpacked_boxes[1*pre_nms_topn + i] = p_proposals[5*i + 1]; + unpacked_boxes[2*pre_nms_topn + i] = p_proposals[5*i + 2]; + unpacked_boxes[3*pre_nms_topn + i] = p_proposals[5*i + 3]; + unpacked_boxes[4*pre_nms_topn + i] = p_proposals[5*i + 4]; + }); +} + +static +void 
nms_cpu(const int num_boxes, int is_dead[], + const float* boxes, int index_out[], int* const num_out, + const int base_index, const float nms_thresh, const int max_num_out, + float coordinates_offset) { + const int num_proposals = num_boxes; + int count = 0; + + const float* x0 = boxes + 0 * num_proposals; + const float* y0 = boxes + 1 * num_proposals; + const float* x1 = boxes + 2 * num_proposals; + const float* y1 = boxes + 3 * num_proposals; + + std::memset(is_dead, 0, num_boxes * sizeof(int)); + +#if defined(HAVE_AVX2) + __m256 vc_fone = _mm256_set1_ps(coordinates_offset); + __m256i vc_ione = _mm256_set1_epi32(1); + __m256 vc_zero = _mm256_set1_ps(0.0f); + + __m256 vc_nms_thresh = _mm256_set1_ps(nms_thresh); +#endif + + for (int box = 0; box < num_boxes; ++box) { + if (is_dead[box]) + continue; + + index_out[count++] = base_index + box; + if (count == max_num_out) + break; + + int tail = box + 1; + +#if defined(HAVE_AVX2) + __m256 vx0i = _mm256_set1_ps(x0[box]); + __m256 vy0i = _mm256_set1_ps(y0[box]); + __m256 vx1i = _mm256_set1_ps(x1[box]); + __m256 vy1i = _mm256_set1_ps(y1[box]); + + __m256 vA_width = _mm256_sub_ps(vx1i, vx0i); + __m256 vA_height = _mm256_sub_ps(vy1i, vy0i); + __m256 vA_area = _mm256_mul_ps(_mm256_add_ps(vA_width, vc_fone), _mm256_add_ps(vA_height, vc_fone)); + + for (; tail <= num_boxes - 8; tail += 8) { + __m256i *pdst = reinterpret_cast<__m256i*>(is_dead + tail); + __m256i vdst = _mm256_loadu_si256(pdst); + + __m256 vx0j = _mm256_loadu_ps(x0 + tail); + __m256 vy0j = _mm256_loadu_ps(y0 + tail); + __m256 vx1j = _mm256_loadu_ps(x1 + tail); + __m256 vy1j = _mm256_loadu_ps(y1 + tail); + + __m256 vx0 = _mm256_max_ps(vx0i, vx0j); + __m256 vy0 = _mm256_max_ps(vy0i, vy0j); + __m256 vx1 = _mm256_min_ps(vx1i, vx1j); + __m256 vy1 = _mm256_min_ps(vy1i, vy1j); + + __m256 vwidth = _mm256_add_ps(_mm256_sub_ps(vx1, vx0), vc_fone); + __m256 vheight = _mm256_add_ps(_mm256_sub_ps(vy1, vy0), vc_fone); + __m256 varea = _mm256_mul_ps(_mm256_max_ps(vc_zero, vwidth), _mm256_max_ps(vc_zero, vheight)); + + __m256 vB_width = _mm256_sub_ps(vx1j, vx0j); + __m256 vB_height = _mm256_sub_ps(vy1j, vy0j); + __m256 vB_area = _mm256_mul_ps(_mm256_add_ps(vB_width, vc_fone), _mm256_add_ps(vB_height, vc_fone)); + + __m256 vdivisor = _mm256_sub_ps(_mm256_add_ps(vA_area, vB_area), varea); + __m256 vintersection_area = _mm256_div_ps(varea, vdivisor); + + __m256 vcmp_0 = _mm256_cmp_ps(vx0i, vx1j, _CMP_LE_OS); + __m256 vcmp_1 = _mm256_cmp_ps(vy0i, vy1j, _CMP_LE_OS); + __m256 vcmp_2 = _mm256_cmp_ps(vx0j, vx1i, _CMP_LE_OS); + __m256 vcmp_3 = _mm256_cmp_ps(vy0j, vy1i, _CMP_LE_OS); + __m256 vcmp_4 = _mm256_cmp_ps(vc_nms_thresh, vintersection_area, _CMP_LT_OS); + + vcmp_0 = _mm256_and_ps(vcmp_0, vcmp_1); + vcmp_2 = _mm256_and_ps(vcmp_2, vcmp_3); + vcmp_4 = _mm256_and_ps(vcmp_4, vcmp_0); + vcmp_4 = _mm256_and_ps(vcmp_4, vcmp_2); + + _mm256_storeu_si256(pdst, _mm256_blendv_epi8(vdst, vc_ione, _mm256_castps_si256(vcmp_4))); + } +#endif + + for (; tail < num_boxes; ++tail) { + float res = 0.0f; + + const float x0i = x0[box]; + const float y0i = y0[box]; + const float x1i = x1[box]; + const float y1i = y1[box]; + + const float x0j = x0[tail]; + const float y0j = y0[tail]; + const float x1j = x1[tail]; + const float y1j = y1[tail]; + + if (x0i <= x1j && y0i <= y1j && x0j <= x1i && y0j <= y1i) { + // overlapped region (= box) + const float x0 = std::max(x0i, x0j); + const float y0 = std::max(y0i, y0j); + const float x1 = std::min(x1i, x1j); + const float y1 = std::min(y1i, y1j); + + // intersection area + const float 
width = std::max(0.0f, x1 - x0 + coordinates_offset); + const float height = std::max(0.0f, y1 - y0 + coordinates_offset); + const float area = width * height; + + // area of A, B + const float A_area = (x1i - x0i + coordinates_offset) * (y1i - y0i + coordinates_offset); + const float B_area = (x1j - x0j + coordinates_offset) * (y1j - y0j + coordinates_offset); + + // IoU + res = area / (A_area + B_area - area); + } + + if (nms_thresh < res) + is_dead[tail] = 1; + } + } + + *num_out = count; +} + + +static +void fill_output_blobs(const float* proposals, const int* roi_indices, + float* rois, float* scores, + const int num_proposals, const int num_rois, const int post_nms_topn) { + const float *src_x0 = proposals + 0 * num_proposals; + const float *src_y0 = proposals + 1 * num_proposals; + const float *src_x1 = proposals + 2 * num_proposals; + const float *src_y1 = proposals + 3 * num_proposals; + const float *src_score = proposals + 4 * num_proposals; + + parallel_for(num_rois, [&](size_t i) { + int index = roi_indices[i]; + rois[i * 4 + 0] = src_x0[index]; + rois[i * 4 + 1] = src_y0[index]; + rois[i * 4 + 2] = src_x1[index]; + rois[i * 4 + 3] = src_y1[index]; + scores[i] = src_score[index]; + }); + + if (num_rois < post_nms_topn) { + for (int i = 4 * num_rois; i < 4 * post_nms_topn; i++) { + rois[i] = 0.f; + } + for (int i = num_rois; i < post_nms_topn; i++) { + scores[i] = 0.f; + } + } +} + +bool MKLDNNExperimentalDetectronGenerateProposalsSingleImageNode::isSupportedOperation + (const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto proposalOp = ngraph::as_type_ptr(op); + if (!proposalOp) { + errorMessage = "Node is not an instance of the Proposal from the operations set v0."; + return false; + } + } catch (...) { + return false; + } + return true; +} + +MKLDNNExperimentalDetectronGenerateProposalsSingleImageNode::MKLDNNExperimentalDetectronGenerateProposalsSingleImageNode + (const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + auto proposalOp = ngraph::as_type_ptr(op); + auto proposalAttrs = proposalOp->get_attrs(); + + min_size_ = proposalAttrs.min_size; + nms_thresh_ = proposalAttrs.nms_threshold; + pre_nms_topn_ = proposalAttrs.pre_nms_count; + post_nms_topn_ = proposalAttrs.post_nms_count; + + coordinates_offset = 0.0f; + + roi_indices_.resize(post_nms_topn_); +} + +void MKLDNNExperimentalDetectronGenerateProposalsSingleImageNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}}, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); +} + +void MKLDNNExperimentalDetectronGenerateProposalsSingleImageNode::execute(mkldnn::stream strm) { + try { + if (inDims.size() != 4 || outDims.size() != 2) { + IE_THROW() << "Incorrect number of input or output edges!"; + } + + size_t anchor_dims_size = 1; + for (size_t i = 0; i < getParentEdgeAt(INPUT_ANCHORS)->getDims().ToSizeVector().size(); i++) { + anchor_dims_size *= getParentEdgeAt(INPUT_ANCHORS)->getDims().ToSizeVector()[i]; + } + + size_t deltas_dims_size = 1; + for 
(size_t i = 0; i < getParentEdgeAt(INPUT_DELTAS)->getDims().ToSizeVector().size(); i++) { + deltas_dims_size *= getParentEdgeAt(INPUT_DELTAS)->getDims().ToSizeVector()[i]; + } + if (anchor_dims_size != deltas_dims_size) + IE_THROW() << "'Anchors' blob size for ONNXProposal is incompatible with 'deltas' blob size!"; + + size_t score_dims_size = 1; + for (size_t i = 0; i < getParentEdgeAt(INPUT_SCORES)->getDims().ToSizeVector().size(); i++) { + score_dims_size *= getParentEdgeAt(INPUT_SCORES)->getDims().ToSizeVector()[i]; + } + if (deltas_dims_size != (4 * score_dims_size)) + IE_THROW() << "'Deltas' blob size for ONNXProposal is incompatible with 'scores' blob size!"; + + // Prepare memory + const float *p_deltas_item = reinterpret_cast(getParentEdgeAt(INPUT_DELTAS)->getMemoryPtr()->GetPtr()); + const float *p_scores_item = reinterpret_cast(getParentEdgeAt(INPUT_SCORES)->getMemoryPtr()->GetPtr()); + const float *p_anchors_item = reinterpret_cast(getParentEdgeAt(INPUT_ANCHORS)->getMemoryPtr()->GetPtr()); + const float *p_img_info_cpu = reinterpret_cast(getParentEdgeAt(INPUT_IM_INFO)->getMemoryPtr()->GetPtr()); + + float *p_roi_item = reinterpret_cast(getChildEdgesAtPort(OUTPUT_ROIS)[0]->getMemoryPtr()->GetPtr()); + float *p_roi_score_item = reinterpret_cast(getChildEdgesAtPort(OUTPUT_SCORES)[0]->getMemoryPtr()->GetPtr()); + + const int anchors_num = getParentEdgeAt(INPUT_SCORES)->getDims()[0]; + + // bottom shape: (num_anchors) x H x W + const int bottom_H = getParentEdgeAt(INPUT_DELTAS)->getDims()[1]; + const int bottom_W = getParentEdgeAt(INPUT_DELTAS)->getDims()[2]; + + // input image height & width + const float img_H = p_img_info_cpu[0]; + const float img_W = p_img_info_cpu[1]; + + // scale factor for height & width + + // minimum box width & height + const float min_box_H = min_size_; + const float min_box_W = min_size_; + + // number of all proposals = num_anchors * H * W + const int num_proposals = anchors_num * bottom_H * bottom_W; + + // number of top-n proposals before NMS + const int pre_nms_topn = std::min(num_proposals, pre_nms_topn_); + + // number of final RoIs + int num_rois = 0; + + // enumerate all proposals + // num_proposals = num_anchors * H * W + // (x1, y1, x2, y2, score) for each proposal + // NOTE: for bottom, only foreground scores are passed + struct ProposalBox { + float x0; + float y0; + float x1; + float y1; + float score; + }; + std::vector proposals_(num_proposals); + std::vector unpacked_boxes(5 * pre_nms_topn); + std::vector is_dead(pre_nms_topn); + + // Execute + int batch_size = 1; // inputs[INPUT_DELTAS]->getTensorDesc().getDims()[0]; + for (int n = 0; n < batch_size; ++n) { + refine_anchors(p_deltas_item, p_scores_item, p_anchors_item, + reinterpret_cast(&proposals_[0]), anchors_num, bottom_H, + bottom_W, img_H, img_W, + min_box_H, min_box_W, + static_cast(log(1000. 
/ 16.)), + 1.0f); + std::partial_sort(proposals_.begin(), proposals_.begin() + pre_nms_topn, proposals_.end(), + [](const ProposalBox &struct1, const ProposalBox &struct2) { + return (struct1.score > struct2.score); + }); + + unpack_boxes(reinterpret_cast(&proposals_[0]), &unpacked_boxes[0], pre_nms_topn); + nms_cpu(pre_nms_topn, &is_dead[0], &unpacked_boxes[0], &roi_indices_[0], &num_rois, 0, + nms_thresh_, post_nms_topn_, coordinates_offset); + fill_output_blobs(&unpacked_boxes[0], &roi_indices_[0], p_roi_item, p_roi_score_item, + pre_nms_topn, num_rois, post_nms_topn_); + } + } catch (const std::exception &e) { + std::string errorMsg = e.what(); + IE_THROW() << errorMsg; + } +} + +bool MKLDNNExperimentalDetectronGenerateProposalsSingleImageNode::created() const { + return getType() == ExperimentalDetectronGenerateProposalsSingleImage; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNExperimentalDetectronGenerateProposalsSingleImageNode, ExperimentalDetectronGenerateProposalsSingleImage) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_generate_proposals_single_image_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_generate_proposals_single_image_node.h new file mode 100644 index 00000000000000..b2f5f0bcd89fe1 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_generate_proposals_single_image_node.h @@ -0,0 +1,50 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNExperimentalDetectronGenerateProposalsSingleImageNode : public MKLDNNNode { +public: + MKLDNNExperimentalDetectronGenerateProposalsSingleImageNode(const std::shared_ptr& op, + const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + // Inputs: + // rois, shape [n, 4] + // rois_probs, shape [n] + // Outputs: + // top_rois, shape [max_rois, 4] + + const int INPUT_IM_INFO {0}; + const int INPUT_ANCHORS {1}; + const int INPUT_DELTAS {2}; + const int INPUT_SCORES {3}; + const int OUTPUT_ROIS {0}; + const int OUTPUT_SCORES {1}; + + float min_size_; + int pre_nms_topn_; + int post_nms_topn_; + float nms_thresh_; + float coordinates_offset; + + std::vector roi_indices_; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_priorgridgenerator_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_priorgridgenerator_node.cpp new file mode 100644 index 00000000000000..b5d073a0b3552e --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_priorgridgenerator_node.cpp @@ -0,0 +1,95 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "base.hpp" + +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_experimental_detectron_priorgridgenerator_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNExperimentalDetectronPriorGridGeneratorNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto 
priorGridGen = std::dynamic_pointer_cast(op); + if (!priorGridGen) { + errorMessage = "Only opset6 ExperimentalDetectronPriorGridGenerator operation is supported"; + return false; + } + } catch (...) { + return false; + } + return true; +} + +MKLDNNExperimentalDetectronPriorGridGeneratorNode::MKLDNNExperimentalDetectronPriorGridGeneratorNode + (const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "ExperimentalDetectronPriorGridGenerator layer with name '" + op->get_friendly_name() + "'"; + const auto priorGridGen = std::dynamic_pointer_cast(op); + if (getOriginalInputsNumber() != 3 || getOriginalOutputsNumber() != 1) + IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; + + if (op->get_input_shape(INPUT_PRIORS).size() != 2 || + op->get_input_shape(INPUT_FEATUREMAP).size() != 4 || + op->get_input_shape(INPUT_IMAGE).size() != 4) + IE_THROW() << errorPrefix << " has unsupported input shape"; + + const auto &attr = priorGridGen->get_attrs(); + grid_w_ = attr.w; + grid_h_ = attr.h; + stride_h_ = attr.stride_y; + stride_w_ = attr.stride_x; +} + +void MKLDNNExperimentalDetectronPriorGridGeneratorNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}}, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); +} + +void MKLDNNExperimentalDetectronPriorGridGeneratorNode::execute(mkldnn::stream strm) { + const int num_priors_ = getParentEdgeAt(INPUT_PRIORS)->getDims()[0]; + assert(getParentEdgeAt(INPUT_PRIORS)->getDims()[1] == 4); + + // Execute + const int layer_width = grid_w_ ? grid_w_ : getParentEdgeAt(INPUT_FEATUREMAP)->getDims()[3]; + const int layer_height = grid_h_ ? grid_h_ : getParentEdgeAt(INPUT_FEATUREMAP)->getDims()[2]; + const float step_w = stride_w_ ? stride_w_ : static_cast(getParentEdgeAt(INPUT_IMAGE)->getDims()[3]) / layer_width; + const float step_h = stride_h_ ? 
stride_h_ : static_cast(getParentEdgeAt(INPUT_IMAGE)->getDims()[2]) / layer_height; + + const auto *bottom_data_0 = reinterpret_cast(getParentEdgeAt(0)->getMemoryPtr()->GetPtr()); + auto *top_data_0 = reinterpret_cast(getChildEdgesAtPort(OUTPUT_ROIS)[0]->getMemoryPtr()->GetPtr()); + + for (int h = 0; h < layer_height; ++h) { + for (int w = 0; w < layer_width; ++w) { + for (int s = 0; s < num_priors_; ++s) { + top_data_0[0] = bottom_data_0[4 * s + 0] + step_w * (w + 0.5f); + top_data_0[1] = bottom_data_0[4 * s + 1] + step_h * (h + 0.5f); + top_data_0[2] = bottom_data_0[4 * s + 2] + step_w * (w + 0.5f); + top_data_0[3] = bottom_data_0[4 * s + 3] + step_h * (h + 0.5f); + top_data_0 += 4; + } + } + } +} + +bool MKLDNNExperimentalDetectronPriorGridGeneratorNode::created() const { + return getType() == ExperimentalDetectronPriorGridGenerator; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNExperimentalDetectronPriorGridGeneratorNode, ExperimentalDetectronPriorGridGenerator) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_priorgridgenerator_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_priorgridgenerator_node.h new file mode 100644 index 00000000000000..9ef117f44e65f7 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_priorgridgenerator_node.h @@ -0,0 +1,46 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNExperimentalDetectronPriorGridGeneratorNode : public MKLDNNNode { +public: + MKLDNNExperimentalDetectronPriorGridGeneratorNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + // Inputs: + // priors, shape [n, 4] + // [feature_map], shape [b, c, h, w] + // [im_data], shape [b, 3, im_h, im_w] + // Outputs: + // priors_grid, shape [m, 4] + + const int INPUT_PRIORS {0}; + const int INPUT_FEATUREMAP {1}; + const int INPUT_IMAGE {2}; + + const int OUTPUT_ROIS {0}; + + int grid_w_; + int grid_h_; + float stride_w_; + float stride_h_; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_roifeatureextractor_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_roifeatureextractor_node.cpp new file mode 100644 index 00000000000000..94e7f033a95548 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_roifeatureextractor_node.cpp @@ -0,0 +1,413 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "base.hpp" + +#include +#include +#include + +#include +#include "ie_parallel.hpp" +#include "common/cpu_memcpy.h" +#include "mkldnn_experimental_detectron_roifeatureextractor_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +// implementation taken from Caffe2 +template +struct PreCalc { + int pos1; + int pos2; + int pos3; + int pos4; + T w1; + T w2; + T w3; + T w4; +}; + +template +void pre_calc_for_bilinear_interpolate( + const int height, + const int width, + const int pooled_height, + const 
int pooled_width, + const int iy_upper, + const int ix_upper, + T roi_start_h, + T roi_start_w, + T bin_size_h, + T bin_size_w, + int roi_bin_grid_h, + int roi_bin_grid_w, + std::vector>& pre_calc) { + int pre_calc_index = 0; + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + for (int iy = 0; iy < iy_upper; iy++) { + const T yy = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < ix_upper; ix++) { + const T xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + T x = xx; + T y = yy; + // deal with: inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + PreCalc pc; + pc.pos1 = 0; + pc.pos2 = 0; + pc.pos3 = 0; + pc.pos4 = 0; + pc.w1 = 0; + pc.w2 = 0; + pc.w3 = 0; + pc.w4 = 0; + pre_calc.at(pre_calc_index) = pc; + pre_calc_index += 1; + continue; + } + + if (y <= 0) { + y = 0; + } + if (x <= 0) { + x = 0; + } + + int y_low = static_cast(y); + int x_low = static_cast(x); + int y_high = 0; + int x_high = 0; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = static_cast(1) - ly, hx = static_cast(1) - lx; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + // save weights and indeces + PreCalc pc; + pc.pos1 = y_low * width + x_low; + pc.pos2 = y_low * width + x_high; + pc.pos3 = y_high * width + x_low; + pc.pos4 = y_high * width + x_high; + pc.w1 = w1; + pc.w2 = w2; + pc.w3 = w3; + pc.w4 = w4; + pre_calc[pre_calc_index] = pc; + + pre_calc_index += 1; + } + } + } + } +} + +template +void ROIAlignForward_cpu_kernel( + const int nthreads, + const T* bottom_data, + const T& spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, + const T* bottom_rois, + const bool aligned, + T* top_data) { + int roi_cols = 4; + + int n_rois = nthreads / channels / pooled_width / pooled_height; + // (n, c, ph, pw) is an element in the pooled output + parallel_for(n_rois, [&](size_t n) { + int index_n = n * channels * pooled_width * pooled_height; + + // roi could have 4 or 5 columns + const T* offset_bottom_rois = bottom_rois + n * roi_cols; + int roi_batch_ind = 0; + if (roi_cols == 5) { + roi_batch_ind = static_cast(offset_bottom_rois[0]); + offset_bottom_rois++; + } + + T offset = aligned ? (T)0.5 : (T)0.0; + // Do not using rounding; this implementation detail is critical + T roi_start_w = offset_bottom_rois[0] * spatial_scale - offset; + T roi_start_h = offset_bottom_rois[1] * spatial_scale - offset; + T roi_end_w = offset_bottom_rois[2] * spatial_scale - offset; + T roi_end_h = offset_bottom_rois[3] * spatial_scale - offset; + + // Force malformed ROIs to be 1x1 + T roi_width = (std::max)(roi_end_w - roi_start_w, (T)1.); + T roi_height = (std::max)(roi_end_h - roi_start_h, (T)1.); + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? 
sampling_ratio + : static_cast(ceil(roi_height / pooled_height)); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : static_cast(ceil(roi_width / pooled_width)); + + // We do average (integral) pooling inside a bin + const T count = static_cast(roi_bin_grid_h * roi_bin_grid_w); // e.g. = 4 + + // we want to precalculate indeces and weights shared by all chanels, + // this is the key point of optimiation + std::vector> pre_calc( + roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); + pre_calc_for_bilinear_interpolate( + height, + width, + pooled_height, + pooled_width, + roi_bin_grid_h, + roi_bin_grid_w, + roi_start_h, + roi_start_w, + bin_size_h, + bin_size_w, + roi_bin_grid_h, + roi_bin_grid_w, + pre_calc); + + for (int c = 0; c < channels; c++) { + int index_n_c = index_n + c * pooled_width * pooled_height; + const T* offset_bottom_data = + bottom_data + (roi_batch_ind * channels + c) * height * width; + int pre_calc_index = 0; + + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + int index = index_n_c + ph * pooled_width + pw; + + T output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + PreCalc pc = pre_calc[pre_calc_index]; + output_val += pc.w1 * offset_bottom_data[pc.pos1] + + pc.w2 * offset_bottom_data[pc.pos2] + + pc.w3 * offset_bottom_data[pc.pos3] + + pc.w4 * offset_bottom_data[pc.pos4]; + + pre_calc_index += 1; + } + } + output_val /= count; + + top_data[index] = output_val; + } // for pw + } // for ph + } // for c + }); +} + + +void redistribute_rois(const float* rois, int* level_ids, + const int num_rois, const int levels_num) { + const float canonical_scale = 224.0f; + const int canonical_level = 2; + + for (int i = 0; i < num_rois; ++i) { + const float x0 = rois[4 * i + 0]; + const float y0 = rois[4 * i + 1]; + const float x1 = rois[4 * i + 2]; + const float y1 = rois[4 * i + 3]; + + int target_level = levels_num; + float area = (x1 - x0) * (y1 - y0); + if (area > 0) { + area = std::sqrt(area) / canonical_scale; + area = std::log2(area + 1e-6f); + target_level = static_cast(std::floor(area + canonical_level)); + target_level = (std::max)(0, (std::min)(levels_num - 1, target_level)); + } + + level_ids[i] = target_level; + } +} + + +void reord(const float* src_data, const int* ranks, const int n, const int step, float* dst_data, + int* dst_mapping) { + std::iota(dst_mapping, dst_mapping + n, 0); + std::sort(dst_mapping, dst_mapping + n, [&ranks](size_t i1, size_t i2) {return ranks[i1] < ranks[i2];}); + for (int i = 0; i < n; ++i) { + const int j = dst_mapping[i]; + assert(0 <= j && j < n); + cpu_memcpy(dst_data + i * step, src_data + j * step, sizeof(float) * step); + } +} + +void split_points(const std::vector& ids, std::vector& rois_per_level, const int levels_num) { + rois_per_level.clear(); + rois_per_level.resize(levels_num, 0); + for (size_t i = 0; i < ids.size(); ++i) { + assert(0 <= ids[i] && ids[i] < levels_num); + rois_per_level[ids[i]]++; + } + for (int i = 1; i < levels_num; ++i) { + rois_per_level[i] += rois_per_level[i - 1]; + } + rois_per_level.insert(rois_per_level.begin(), 0); +} + + +void reorder_rois(const float *rois, const int* ids, int* mapping, const int rois_num, + float * reordered_rois, std::vector& rois_per_level, const int levels_num) { + rois_per_level.clear(); + rois_per_level.resize(levels_num, 0); + for (int i = 0; i < rois_num; ++i) { + assert(0 <= ids[i] && ids[i] < levels_num); + 
rois_per_level[ids[i]]++; + } + for (int i = 1; i < levels_num; ++i) { + rois_per_level[i] += rois_per_level[i - 1]; + } + rois_per_level.insert(rois_per_level.begin(), 0); + + std::vector level_counter = rois_per_level; + + for (int i = 0; i < rois_num; ++i) { + const int level = ids[i]; + assert(level < levels_num); + const int j = level_counter[level]; + assert(0 <= j && j < rois_num); + reordered_rois[j * 4 + 0] = rois[i * 4 + 0]; + reordered_rois[j * 4 + 1] = rois[i * 4 + 1]; + reordered_rois[j * 4 + 2] = rois[i * 4 + 2]; + reordered_rois[j * 4 + 3] = rois[i * 4 + 3]; + level_counter[level]++; + } +} + +bool MKLDNNExperimentalDetectronROIFeatureExtractorNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto roiFeatureExtractor = std::dynamic_pointer_cast(op); + if (!roiFeatureExtractor) { + errorMessage = "Only opset6 ExperimentalDetectronROIFeatureExtractor operation is supported"; + return false; + } + } catch (...) { + return false; + } + return true; +} + +MKLDNNExperimentalDetectronROIFeatureExtractorNode::MKLDNNExperimentalDetectronROIFeatureExtractorNode + (const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + const auto roiFeatureExtractor = std::dynamic_pointer_cast(op); + const auto &attr = roiFeatureExtractor->get_attrs(); + output_dim_ = attr.output_size; + pyramid_scales_ = attr.pyramid_scales; + sampling_ratio_ = attr.sampling_ratio; + aligned_ = attr.aligned; + pooled_height_ = output_dim_; + pooled_width_ = output_dim_; +} + +void MKLDNNExperimentalDetectronROIFeatureExtractorNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + std::vector inDataConf; + inDataConf.reserve(getOriginalInputsNumber()); + for (int i = 0; i < getOriginalInputsNumber(); ++i) + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::FP32); + + addSupportedPrimDesc(inDataConf, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); +} + +void MKLDNNExperimentalDetectronROIFeatureExtractorNode::execute(mkldnn::stream strm) { + const int levels_num = inDims.size() - INPUT_FEATURES_START; + const int num_rois = getParentEdgeAt(INPUT_ROIS)->getDims()[0]; + const int channels_num = getParentEdgeAt(INPUT_FEATURES_START)->getDims()[1]; + const int feaxels_per_roi = pooled_height_ * pooled_width_ * channels_num; + + auto *input_rois = reinterpret_cast(getParentEdgeAt(INPUT_ROIS)->getMemoryPtr()->GetPtr()); + auto *output_rois_features = reinterpret_cast(getChildEdgesAtPort(OUTPUT_ROI_FEATURES)[0]->getMemoryPtr()->GetPtr()); + float *output_rois = nullptr; + if (OUTPUT_ROIS < outDims.size()) { + output_rois = reinterpret_cast(getChildEdgesAtPort(OUTPUT_ROIS)[0]->getMemoryPtr()->GetPtr()); + } + + std::vector level_ids(num_rois, 0); + redistribute_rois(input_rois, reinterpret_cast(&level_ids[0]), num_rois, levels_num); + + std::vector reordered_rois(4 * num_rois, 0); + std::vector original_rois_mapping(num_rois, 0); + reord(input_rois, &level_ids[0], num_rois, 4, &reordered_rois[0], &original_rois_mapping[0]); + + std::vector rois_per_level; + split_points(level_ids, rois_per_level, levels_num + 1); + + std::vector output_rois_features_temp(feaxels_per_roi * num_rois, 0); + for (int i = 0; i < levels_num; ++i) { + const 
int level_rois_offset = rois_per_level[i]; + const int level_rois_num = rois_per_level[i + 1] - level_rois_offset; + if (level_rois_num > 0) { + auto *featuremap = reinterpret_cast(getParentEdgeAt(INPUT_FEATURES_START + i)->getMemoryPtr()->GetPtr()); + const int featuremap_height = getParentEdgeAt(INPUT_FEATURES_START + i)->getDims()[2]; + const int featuremap_width = getParentEdgeAt(INPUT_FEATURES_START + i)->getDims()[3]; + ROIAlignForward_cpu_kernel(feaxels_per_roi * level_rois_num, + featuremap, + 1.0f / pyramid_scales_[i], + channels_num, + featuremap_height, + featuremap_width, + pooled_height_, + pooled_width_, + sampling_ratio_, + &reordered_rois[4 * level_rois_offset], + aligned_, + &output_rois_features_temp[feaxels_per_roi * level_rois_offset]); + } + } + + std::vector dummy_mapping(num_rois, 0); + reord(&output_rois_features_temp[0], &original_rois_mapping[0], num_rois, feaxels_per_roi, + output_rois_features, &dummy_mapping[0]); + if (output_rois != nullptr) { + cpu_memcpy(output_rois, input_rois, 4 * num_rois * sizeof(float)); + } +} + +bool MKLDNNExperimentalDetectronROIFeatureExtractorNode::created() const { + return getType() == ExperimentalDetectronROIFeatureExtractor; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNExperimentalDetectronROIFeatureExtractorNode, ExperimentalDetectronROIFeatureExtractor) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_roifeatureextractor_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_roifeatureextractor_node.h new file mode 100644 index 00000000000000..bfcb9061f26fbe --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_roifeatureextractor_node.h @@ -0,0 +1,41 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNExperimentalDetectronROIFeatureExtractorNode : public MKLDNNNode { +public: + MKLDNNExperimentalDetectronROIFeatureExtractorNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + const int INPUT_ROIS {0}; + const int INPUT_FEATURES_START {1}; + + const int OUTPUT_ROI_FEATURES {0}; + const int OUTPUT_ROIS {1}; + + int output_dim_ = 0; + int pooled_height_ = 0; + int pooled_width_ = 0; + std::vector pyramid_scales_; + int sampling_ratio_ = 0; + bool aligned_ = false; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_topkrois_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_topkrois_node.cpp new file mode 100644 index 00000000000000..d543658f78e724 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_topkrois_node.cpp @@ -0,0 +1,82 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "base.hpp" + +#include +#include +#include + +#include +#include "ie_parallel.hpp" +#include "common/cpu_memcpy.h" +#include "mkldnn_experimental_detectron_topkrois_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool 
MKLDNNExperimentalDetectronTopKROIsNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto topKROI = std::dynamic_pointer_cast(op); + if (!topKROI) { + errorMessage = "Only opset6 ExperimentalDetectronTopKROIs operation is supported"; + return false; + } + } catch (...) { + return false; + } + return true; +} + +MKLDNNExperimentalDetectronTopKROIsNode::MKLDNNExperimentalDetectronTopKROIsNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "ExperimentalDetectronTopKROIs layer with name '" + op->get_friendly_name() + "'"; + const auto topKROI = std::dynamic_pointer_cast(op); + if (getOriginalInputsNumber() != 2 || getOriginalOutputsNumber() != 1) + IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; + + if (op->get_input_shape(INPUT_ROIS).size() != 2 || op->get_input_shape(INPUT_PROBS).size() != 1) + IE_THROW() << errorPrefix << " has nsupported input shape"; + + max_rois_num_ = topKROI->get_max_rois(); +} + +void MKLDNNExperimentalDetectronTopKROIsNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}}, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); +} + +void MKLDNNExperimentalDetectronTopKROIsNode::execute(mkldnn::stream strm) { + const int input_rois_num = getParentEdgeAt(INPUT_ROIS)->getDims()[0]; + const int top_rois_num = (std::min)(max_rois_num_, input_rois_num); + + auto *input_rois = reinterpret_cast(getParentEdgeAt(INPUT_ROIS)->getMemoryPtr()->GetPtr()); + auto *input_probs = reinterpret_cast(getParentEdgeAt(INPUT_PROBS)->getMemoryPtr()->GetPtr()); + auto *output_rois = reinterpret_cast(getChildEdgesAtPort(OUTPUT_ROIS)[0]->getMemoryPtr()->GetPtr()); + + std::vector idx(input_rois_num); + iota(idx.begin(), idx.end(), 0); + // FIXME. partial_sort is enough here. 
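The FIXME above deserves a concrete illustration: only the first `top_rois_num` entries of `idx` are ever consumed, so `std::partial_sort` is sufficient and avoids ordering the whole index vector. A minimal standalone sketch of that alternative (plain C++ with illustrative names, not part of this patch):

```cpp
#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

// Return the indices of the top_k highest-probability ROIs, ordered by
// descending probability; only the leading top_k positions are sorted.
std::vector<size_t> top_k_roi_indices(const std::vector<float>& probs, size_t top_k) {
    std::vector<size_t> idx(probs.size());
    std::iota(idx.begin(), idx.end(), 0);
    top_k = std::min(top_k, idx.size());
    std::partial_sort(idx.begin(), idx.begin() + top_k, idx.end(),
                      [&probs](size_t i1, size_t i2) { return probs[i1] > probs[i2]; });
    idx.resize(top_k);
    return idx;
}
```

The copy loop that follows would consume these indices exactly as it does with the fully sorted vector.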
+ sort(idx.begin(), idx.end(), [&input_probs](size_t i1, size_t i2) {return input_probs[i1] > input_probs[i2];}); + + for (int i = 0; i < top_rois_num; ++i) { + cpu_memcpy(output_rois + 4 * i, input_rois + 4 * idx[i], 4 * sizeof(float)); + } +} + +bool MKLDNNExperimentalDetectronTopKROIsNode::created() const { + return getType() == ExperimentalDetectronTopKROIs; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNExperimentalDetectronTopKROIsNode, ExperimentalDetectronTopKROIs) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_topkrois_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_topkrois_node.h new file mode 100644 index 00000000000000..76171de71e473c --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_experimental_detectron_topkrois_node.h @@ -0,0 +1,40 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNExperimentalDetectronTopKROIsNode : public MKLDNNNode { +public: + MKLDNNExperimentalDetectronTopKROIsNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + // Inputs: + // rois, shape [n, 4] + // rois_probs, shape [n] + // Outputs: + // top_rois, shape [max_rois, 4] + + const int INPUT_ROIS {0}; + const int INPUT_PROBS {1}; + + const int OUTPUT_ROIS {0}; + int max_rois_num_; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/extract_image_patches.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_extract_image_patches_node.cpp similarity index 66% rename from inference-engine/src/mkldnn_plugin/nodes/extract_image_patches.cpp rename to inference-engine/src/mkldnn_plugin/nodes/mkldnn_extract_image_patches_node.cpp index b0f0aa5d327ed8..d4c5d3037962b0 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/extract_image_patches.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_extract_image_patches_node.cpp @@ -1,22 +1,22 @@ -// Copyright (C) 2020-2021 Intel Corporation +// Copyright (C) 2018-2021 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // -#include "extract_image_patches.hpp" -#include "caseless.hpp" -#include "ie_parallel.hpp" -#include "list.hpp" -#include +#include "base.hpp" + #include #include #include + #include +#include "ie_parallel.hpp" +#include "mkldnn_extract_image_patches_node.h" +#include "list.hpp" +#include +#include "caseless.hpp" using namespace MKLDNNPlugin; - -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { +using namespace InferenceEngine; using details::CaselessEq; @@ -266,11 +266,11 @@ struct jit_extract_image_patches_kernel : public jit_uni_extract_image_patches_k align(64); L(gather_index_table); for (int32_t i = 0; i < vlen / sizeof(int32_t); i++) - dd(i * jpp.SW * jpp.dtype_size); + dd(i * jpp.SW * jpp.dtype_size); } }; -bool ExtractImagePatchesImpl::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool MKLDNNExtractImagePatchesNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { const auto extImgPatcher = 
std::dynamic_pointer_cast(op); if (!extImgPatcher) { @@ -292,140 +292,141 @@ bool ExtractImagePatchesImpl::isSupportedOperation(const std::shared_ptr& op) { - try { - std::string errorMessage; - if (!isSupportedOperation(op, errorMessage)) { - IE_THROW(NotImplemented) << errorMessage; - } +MKLDNNExtractImagePatchesNode::MKLDNNExtractImagePatchesNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } - errorPrefix = "ExtractImagePatches layer with name '" + op->get_friendly_name() + "' "; - const auto extImgPatcher = std::dynamic_pointer_cast(op); + errorPrefix = "ExtractImagePatches layer with name '" + op->get_friendly_name() + "' "; + const auto extImgPatcher = std::dynamic_pointer_cast(op); - if (op->get_input_size() != 1 || op->get_output_size() != 1) - IE_THROW() << errorPrefix << "has incorrect number of input or output edges!" - << " Input: " << op->get_input_size() << "; Output: " << op->get_output_size(); - - if (op->get_input_shape(0).size() != 4) - IE_THROW() << errorPrefix << "must have 4D input tensor. Actual: " << op->get_input_shape(0).size(); - - if (op->get_output_shape(0).size() != 4) - IE_THROW() << errorPrefix << "must have 4D output tensor. Actual: " << op->get_output_shape(0).size(); - - const auto precision = details::convertPrecision(op->get_input_element_type(0)); - if (_supported_precisions_sizes.find(precision.size()) == _supported_precisions_sizes.end()) - IE_THROW() << errorPrefix << "has unsupported precision: " << precision.name(); - - auto ksizes = extImgPatcher->get_sizes(); - auto strides = extImgPatcher->get_strides(); - auto rates = extImgPatcher->get_rates(); - if (extImgPatcher->get_auto_pad() == ngraph::op::PadType::VALID) { - _auto_pad = ExtImgPatcherPadType::VALID; - } else if (extImgPatcher->get_auto_pad() == ngraph::op::PadType::SAME_LOWER) { - _auto_pad = ExtImgPatcherPadType::SAME_LOWER; - } else if (extImgPatcher->get_auto_pad() == ngraph::op::PadType::SAME_UPPER) { - _auto_pad = ExtImgPatcherPadType::SAME_UPPER; - } else { - IE_THROW() << errorPrefix << "has unsupported pad type: " << extImgPatcher->get_auto_pad(); - } + if (getOriginalInputsNumber() != 1 || getOriginalOutputsNumber() != 1) + IE_THROW() << errorPrefix << "has incorrect number of input or output edges!" + << " Input: " << getOriginalInputsNumber() << "; Output: " << getOriginalOutputsNumber(); + + if (op->get_input_shape(0).size() != 4) + IE_THROW() << errorPrefix << "must have 4D input tensor. Actual: " << op->get_input_shape(0).size(); - if (ksizes.size() != 2 || strides.size() != 2 || rates.size() != 2) - IE_THROW() << errorPrefix << "must have the following attributes with shape {2}: sizes, strides, rates."; - _ksizes.clear(); - _strides.clear(); - _rates.clear(); - for (const auto& x : ksizes) { - if (x < 0) - IE_THROW() << "Kernel sizes must be non-negative, got '" << x << "'."; - _ksizes.push_back(static_cast(x)); + if (op->get_output_shape(0).size() != 4) + IE_THROW() << errorPrefix << "must have 4D output tensor. 
Actual: " << op->get_output_shape(0).size(); + + auto ksizes = extImgPatcher->get_sizes(); + auto strides = extImgPatcher->get_strides(); + auto rates = extImgPatcher->get_rates(); + if (extImgPatcher->get_auto_pad() == ngraph::op::PadType::VALID) { + _auto_pad = ExtImgPatcherPadType::VALID; + } else if (extImgPatcher->get_auto_pad() == ngraph::op::PadType::SAME_LOWER) { + _auto_pad = ExtImgPatcherPadType::SAME_LOWER; + } else if (extImgPatcher->get_auto_pad() == ngraph::op::PadType::SAME_UPPER) { + _auto_pad = ExtImgPatcherPadType::SAME_UPPER; + } else { + IE_THROW() << errorPrefix << "has unsupported pad type: " << extImgPatcher->get_auto_pad(); + } + + if (ksizes.size() != 2 || strides.size() != 2 || rates.size() != 2) + IE_THROW() << errorPrefix << "must have the following attributes with shape {2}: sizes, strides, rates."; + _ksizes.clear(); + _strides.clear(); + _rates.clear(); + for (const auto& x : ksizes) { + if (x < 0) + IE_THROW() << "Kernel sizes must be non-negative, got '" << x << "'."; + _ksizes.push_back(static_cast(x)); + } + for (const auto& x : strides) { + if (x < 0) + IE_THROW() << "Strides must be non-negative, got '" << x << "'."; + _strides.push_back(static_cast(x)); + } + for (const auto& x : rates) { + if (x < 0) + IE_THROW() << "Rates must be non-negative, got '" << x << "'."; + _rates.push_back(static_cast(x)); + } + + SizeVector in_dims = op->get_input_shape(0); + _pad_left = 0; + _pad_top = 0; + jit_extract_image_patches_params jpp; + jpp.need_padding = false; + if (_auto_pad != ExtImgPatcherPadType::VALID) { + const size_t iheight = in_dims[2]; + const size_t iwidth = in_dims[3]; + const int64_t ihStep = _ksizes[0] + (_rates[0] - 1) * (_ksizes[0] - 1); + const int64_t iwStep = _ksizes[1] + (_rates[1] - 1) * (_ksizes[1] - 1); + + int64_t PW = (std::ceil(1.f * iwidth/_strides[1]) - 1) * _strides[1] + iwStep - iwidth; + int64_t PH = (std::ceil(1.f * iheight/_strides[0]) - 1) * _strides[0] + ihStep - iheight; + + int64_t increment_sign = 0; + if (_auto_pad == ExtImgPatcherPadType::SAME_LOWER) { + increment_sign = 1; + } else if (_auto_pad == ExtImgPatcherPadType::SAME_UPPER) { + increment_sign = -1; } - for (const auto& x : strides) { - if (x < 0) - IE_THROW() << "Strides must be non-negative, got '" << x << "'."; - _strides.push_back(static_cast(x)); + + if ((PW > 0) && (PW < iwStep)) { + _pad_left = static_cast((PW + increment_sign * (PW % 2)) / 2); + jpp.need_padding = true; } - for (const auto& x : rates) { - if (x < 0) - IE_THROW() << "Rates must be non-negative, got '" << x << "'."; - _rates.push_back(static_cast(x)); + if ((PH > 0) && (PH < ihStep)) { + _pad_top = static_cast((PH + increment_sign * (PH % 2)) / 2); + jpp.need_padding = true; } + } - SizeVector in_dims = op->get_input_shape(0); - _pad_left = 0; - _pad_top = 0; - jit_extract_image_patches_params jpp; - jpp.need_padding = false; - if (_auto_pad != ExtImgPatcherPadType::VALID) { - const size_t iheight = in_dims[2]; - const size_t iwidth = in_dims[3]; - const int64_t ihStep = _ksizes[0] + (_rates[0] - 1) * (_ksizes[0] - 1); - const int64_t iwStep = _ksizes[1] + (_rates[1] - 1) * (_ksizes[1] - 1); - - int64_t PW = (std::ceil(1.f * iwidth/_strides[1]) - 1) * _strides[1] + iwStep - iwidth; - int64_t PH = (std::ceil(1.f * iheight/_strides[0]) - 1) * _strides[0] + ihStep - iheight; - - int64_t increment_sign = 0; - if (_auto_pad == ExtImgPatcherPadType::SAME_LOWER) { - increment_sign = 1; - } else if (_auto_pad == ExtImgPatcherPadType::SAME_UPPER) { - increment_sign = -1; - } + jpp.IW = in_dims[3]; + 
SizeVector out_dims = op->get_output_shape(0); + jpp.OH = out_dims[2]; + jpp.OW = out_dims[3]; + jpp.KH = _ksizes[0]; + jpp.KW = _ksizes[1]; + jpp.SH = _strides[0]; + jpp.SW = _strides[1]; + jpp.dtype_size = getOriginalInputPrecisionAtPort(0).size(); + jpp.block_size = 1; + + if (mayiuse(x64::avx512_common)) { + jpp.block_size = cpu_isa_traits::vlen / jpp.dtype_size; + extract_image_patches_kernel.reset(new jit_extract_image_patches_kernel(jpp)); + } else if (mayiuse(x64::avx2)) { + jpp.block_size = cpu_isa_traits::vlen / jpp.dtype_size; + extract_image_patches_kernel.reset(new jit_extract_image_patches_kernel(jpp)); + } else if (mayiuse(x64::sse41)) { + jpp.block_size = cpu_isa_traits::vlen / jpp.dtype_size; + extract_image_patches_kernel.reset(new jit_extract_image_patches_kernel(jpp)); + } - if ((PW > 0) && (PW < iwStep)) { - _pad_left = static_cast((PW + increment_sign * (PW % 2)) / 2); - jpp.need_padding = true; - } - if ((PH > 0) && (PH < ihStep)) { - _pad_top = static_cast((PH + increment_sign * (PH % 2)) / 2); - jpp.need_padding = true; - } - } + if (extract_image_patches_kernel) + extract_image_patches_kernel->create_ker(); +} - jpp.IW = in_dims[3]; - SizeVector out_dims = op->get_output_shape(0); - jpp.OH = out_dims[2]; - jpp.OW = out_dims[3]; - jpp.KH = _ksizes[0]; - jpp.KW = _ksizes[1]; - jpp.SH = _strides[0]; - jpp.SW = _strides[1]; - jpp.dtype_size = precision.size(); - jpp.block_size = 1; - - if (mayiuse(x64::avx512_common)) { - jpp.block_size = cpu_isa_traits::vlen / jpp.dtype_size; - extract_image_patches_kernel.reset(new jit_extract_image_patches_kernel(jpp)); - } else if (mayiuse(x64::avx2)) { - jpp.block_size = cpu_isa_traits::vlen / jpp.dtype_size; - extract_image_patches_kernel.reset(new jit_extract_image_patches_kernel(jpp)); - } else if (mayiuse(x64::sse41)) { - jpp.block_size = cpu_isa_traits::vlen / jpp.dtype_size; - extract_image_patches_kernel.reset(new jit_extract_image_patches_kernel(jpp)); - } +void MKLDNNExtractImagePatchesNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; - if (extract_image_patches_kernel) - extract_image_patches_kernel->create_ker(); + precision = getOriginalInputPrecisionAtPort(0); + if (_supported_precisions_sizes.find(precision.size()) == _supported_precisions_sizes.end()) + IE_THROW() << errorPrefix << "has unsupported precision: " << precision.name(); - addConfig(op, {{TensorDescCreatorTypes::ncsp, precision}}, - {{TensorDescCreatorTypes::ncsp, precision}}); - } catch (InferenceEngine::Exception &ex) { - errorMsg = ex.what(); - } + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, precision}}, + {{TensorDescCreatorTypes::ncsp, precision}}, + impl_desc_type::ref_any); } -StatusCode ExtractImagePatchesImpl::execute(std::vector& inputs, std::vector& outputs, ResponseDesc *resp) noexcept { - const char *src_data = inputs[0]->cbuffer().as() + - inputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - char *dst_data = outputs[0]->buffer().as() + - outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding(); - const size_t dtype_size = inputs[0]->getTensorDesc().getPrecision().size(); +void MKLDNNExtractImagePatchesNode::execute(mkldnn::stream strm) { + const char *src_data = reinterpret_cast(getParentEdgeAt(0)->getMemoryPtr()->GetPtr()); + char *dst_data = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + const size_t dtype_size = getOriginalInputPrecisionAtPort(0).size(); - const auto& inDims = inputs[0]->getTensorDesc().getDims(); + const auto& 
inDims = getParentEdgeAt(0)->getDims().ToSizeVector(); const size_t IC = inDims[1]; const size_t IH = inDims[2]; const size_t IW = inDims[3]; - const auto& outDims = outputs[0]->getTensorDesc().getDims(); + const auto& outDims = getChildEdgesAtPort(0)[0]->getDims().ToSizeVector(); const size_t OB = outDims[0]; const size_t OH = outDims[2]; const size_t OW = outDims[3]; @@ -435,8 +436,8 @@ StatusCode ExtractImagePatchesImpl::execute(std::vector& inputs, std: const size_t RH = _rates[0], RW = _rates[1]; const size_t PT = _pad_top, PL = _pad_left; - const std::vector istrides = inputs[0]->getTensorDesc().getBlockingDesc().getStrides(); - const std::vector ostrides = outputs[0]->getTensorDesc().getBlockingDesc().getStrides(); + const std::vector istrides = getParentEdgeAt(0)->getDesc().getBlockingDesc().getStrides(); + const std::vector ostrides = getChildEdgesAtPort(0)[0]->getDesc().getBlockingDesc().getStrides(); const std::vector ostrides_partial = {ostrides[0], KW * IC * ostrides[1], IC * ostrides[1], ostrides[1]}; if (extract_image_patches_kernel) { @@ -471,7 +472,7 @@ StatusCode ExtractImagePatchesImpl::execute(std::vector& inputs, std: const size_t iw_hpad = std::ceil((IW - 1.f * iw_start) / SW) > OW ? OW : std::ceil((IW - 1.f * iw_start) / SW); char *my_dst_ptr = dst_data + - (ob * ostrides_partial[0] + kh * ostrides_partial[1] + kw * ostrides_partial[2] + ic * ostrides_partial[3]) * dtype_size; + (ob * ostrides_partial[0] + kh * ostrides_partial[1] + kw * ostrides_partial[2] + ic * ostrides_partial[3]) * dtype_size; const char *my_src_ptr = src_data + (ob * istrides[0] + ic * istrides[1] + ih_start * istrides[2] + iw_start) * dtype_size; size_t num_bytes_to_set = ih_lpad * OW * dtype_size; @@ -480,14 +481,14 @@ StatusCode ExtractImagePatchesImpl::execute(std::vector& inputs, std: const char* src_ptr_h_stop = my_src_ptr + ih_hpad * SH * IW * dtype_size; for (const char *src_h_ptr = my_src_ptr + ih_lpad * SH * IW * dtype_size; - src_h_ptr < src_ptr_h_stop; src_h_ptr += SH * IW * dtype_size) { + src_h_ptr < src_ptr_h_stop; src_h_ptr += SH * IW * dtype_size) { num_bytes_to_set = iw_lpad * dtype_size; memset(my_dst_ptr, 0, num_bytes_to_set); my_dst_ptr += num_bytes_to_set; const char* src_ptr_w_stop = src_h_ptr + iw_hpad * SW * dtype_size; for (const char* src_w_ptr = src_h_ptr + iw_lpad * SW * dtype_size; - src_w_ptr < src_ptr_w_stop; src_w_ptr += SW * dtype_size) { + src_w_ptr < src_ptr_w_stop; src_w_ptr += SW * dtype_size) { num_bytes_to_set = dtype_size; memcpy(my_dst_ptr, src_w_ptr, num_bytes_to_set); my_dst_ptr += num_bytes_to_set; @@ -500,11 +501,12 @@ StatusCode ExtractImagePatchesImpl::execute(std::vector& inputs, std: memset(my_dst_ptr, 0, num_bytes_to_set); }); } - return OK; } -const std::set ExtractImagePatchesImpl::_supported_precisions_sizes = {1, 2, 4}; +const std::set MKLDNNExtractImagePatchesNode::_supported_precisions_sizes = {1, 2, 4}; + +bool MKLDNNExtractImagePatchesNode::created() const { + return getType() == ExtractImagePatches; +} -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine +REG_MKLDNN_PRIM_FOR(MKLDNNExtractImagePatchesNode, ExtractImagePatches) diff --git a/inference-engine/src/mkldnn_plugin/nodes/extract_image_patches.hpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_extract_image_patches_node.h similarity index 64% rename from inference-engine/src/mkldnn_plugin/nodes/extract_image_patches.hpp rename to inference-engine/src/mkldnn_plugin/nodes/mkldnn_extract_image_patches_node.h index 8ed62fbca89b0d..2990b12d08f2e3 100644 
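The auto-pad arithmetic in the constructor above is easy to misread, so here is the same rule restated as a small standalone helper (a sketch that mirrors the logic in this patch; the function name and parameters are illustrative): the dilated kernel extent is `ksize + (rate - 1) * (ksize - 1)`, the total padding is whatever makes `ceil(size / stride)` windows fit, and SAME_LOWER versus SAME_UPPER only decides which side receives the odd pixel.

```cpp
#include <cmath>
#include <cstdint>

// Padding applied before the first element along one spatial axis.
// same_lower = true  -> SAME_LOWER (the extra pixel, if any, goes to the left/top)
// same_lower = false -> SAME_UPPER (the extra pixel goes to the right/bottom)
int64_t same_pad_begin(int64_t size, int64_t ksize, int64_t stride, int64_t rate, bool same_lower) {
    const int64_t step = ksize + (rate - 1) * (ksize - 1);                 // dilated kernel extent
    const int64_t total = (static_cast<int64_t>(std::ceil(1.f * size / stride)) - 1) * stride
                          + step - size;                                   // total padding required
    if (total <= 0 || total >= step)
        return 0;                                                          // same guard as (0 < P < step)
    const int64_t sign = same_lower ? 1 : -1;
    return (total + sign * (total % 2)) / 2;
}
```

For example, with `size = 6`, `ksize = 3`, `stride = 2`, `rate = 1` one padding pixel is needed in total: SAME_LOWER places it before the data (`pad = 1`), while SAME_UPPER leaves the leading side unpadded (`pad = 0`).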
--- a/inference-engine/src/mkldnn_plugin/nodes/extract_image_patches.hpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_extract_image_patches_node.h @@ -1,16 +1,16 @@ -// Copyright (C) 2021 Intel Corporation +// Copyright (C) 2018-2021 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // + #pragma once -#include "base.hpp" +#include +#include +#include +#include #include -#include -#include -namespace InferenceEngine { -namespace Extensions { -namespace Cpu { +namespace MKLDNNPlugin { struct jit_extract_image_patches_params { size_t IW; @@ -40,12 +40,17 @@ struct jit_uni_extract_image_patches_kernel { virtual ~jit_uni_extract_image_patches_kernel() {} }; - -class ExtractImagePatchesImpl : public ExtLayerBase { +class MKLDNNExtractImagePatchesNode : public MKLDNNNode { public: - explicit ExtractImagePatchesImpl(const std::shared_ptr& op); - StatusCode execute(std::vector&, std::vector&, ResponseDesc*) noexcept override; - bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + MKLDNNExtractImagePatchesNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; private: enum class ExtImgPatcherPadType { @@ -63,12 +68,9 @@ class ExtractImagePatchesImpl : public ExtLayerBase { static const std::set _supported_precisions_sizes; ExtImgPatcherPadType _auto_pad; + InferenceEngine::Precision precision; std::string errorPrefix; }; -REG_FACTORY_FOR(ExtractImagePatchesImpl, ExtractImagePatches); - -} // namespace Cpu -} // namespace Extensions -} // namespace InferenceEngine +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_elements_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_elements_node.cpp index eabd4f52aac8b2..e3e14e356912db 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_elements_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_elements_node.cpp @@ -18,7 +18,7 @@ using namespace InferenceEngine; bool MKLDNNGatherElementsNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - auto gatherElementsOp = ngraph::as_type_ptr(op); + const auto gatherElementsOp = ngraph::as_type_ptr(op); if (!gatherElementsOp) { errorMessage = "Node is not an instance of the GatherElements operation from operation set v6."; return false; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_elements_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_elements_node.h index 30d1fda9e9553d..bc19866768dfcf 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_elements_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_elements_node.h @@ -32,7 +32,7 @@ class MKLDNNGatherElementsNode : public MKLDNNNode { size_t dataTypeSize_; int strideAxDst_; int dstAxDim_; - int strideAx1Diff_; + int strideAx1Diff_ = 0; std::string errorPrefix_; template diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_nd_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_nd_node.cpp index 3e858dd309d8ca..ee7623f9b4810b 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_nd_node.cpp +++ 
b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_nd_node.cpp @@ -18,7 +18,7 @@ using namespace InferenceEngine; bool MKLDNNGatherNDNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - auto gatherElementsOp = ngraph::as_type_ptr(op); + const auto gatherElementsOp = ngraph::as_type_ptr(op); if (!gatherElementsOp) { errorMessage = "Node is not an instance of the GatherND operation from operation set v5."; return false; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_node.cpp index 3bd50aadf3357e..ade92f6a4a0060 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_node.cpp @@ -15,13 +15,13 @@ using namespace InferenceEngine; bool MKLDNNGatherNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - auto gatherOp = ngraph::as_type_ptr(op); + const auto gatherOp = ngraph::as_type_ptr(op); if (!gatherOp) { errorMessage = "Only opset7 Gather operation is supported"; return false; } - auto axesOp = gatherOp->get_input_node_shared_ptr(GATHER_AXIS); + const auto axesOp = gatherOp->get_input_node_shared_ptr(GATHER_AXIS); if (!ngraph::as_type_ptr(axesOp)) { errorMessage = "Only Constant operation on 'axis' input is supported"; return false; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_tree_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_tree_node.cpp new file mode 100644 index 00000000000000..ce396446df2418 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_tree_node.cpp @@ -0,0 +1,148 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "base.hpp" + +#include +#include +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_gather_tree_node.h" +#include + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNGatherTreeNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto gatherElementsOp = ngraph::as_type_ptr(op); + if (!gatherElementsOp) { + errorMessage = "Node is not an instance of the GatherTree operation from operation set v1."; + return false; + } + } catch (...) 
{ + return false; + } + return true; +} + +MKLDNNGatherTreeNode::MKLDNNGatherTreeNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = std::string("Node GatherTree with name '") + op->get_friendly_name() + "'"; + if (op->get_input_size() != 4) + IE_THROW() << errorPrefix << " has incorrect number of input edges."; + if (op->get_output_size() != 1) + IE_THROW() << errorPrefix << " has incorrect number of output edges."; + + if (op->get_input_shape(GATHER_TREE_STEP_IDX).size() != 3) + IE_THROW() << errorPrefix << " step_idx vector should be 3 dimension"; + if (op->get_input_shape(GATHER_TREE_PARENT_IDX).size() != 3) + IE_THROW() << errorPrefix << " parent_idx vector should be 3 dimension"; + if (op->get_input_shape(GATHER_TREE_MAX_SEQ_LEN).size() != 1) + IE_THROW() << errorPrefix << " max_seq_len vector should be 1 dimension"; + if (op->get_input_shape(GATHER_TREE_END_TOKEN).size() != 0) + IE_THROW() << errorPrefix << " end_token should be 1 dimension"; +} + +void MKLDNNGatherTreeNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + precision = getOriginalInputPrecisionAtPort(GATHER_TREE_STEP_IDX); + if (!MKLDNNPlugin::one_of(precision, Precision::FP32, Precision::I32)) + precision = Precision::FP32; + + if (getOriginalInputPrecisionAtPort(GATHER_TREE_PARENT_IDX) != precision || + getOriginalInputPrecisionAtPort(GATHER_TREE_MAX_SEQ_LEN) != precision || + getOriginalInputPrecisionAtPort(GATHER_TREE_END_TOKEN) != precision || + getOriginalOutputPrecisionAtPort(0) != precision) { + IE_THROW() << errorPrefix << " has incorrect input/output data precision. 
Must be the same."; + } + + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, precision}, + {TensorDescCreatorTypes::ncsp, precision}, + {TensorDescCreatorTypes::ncsp, precision}, + {TensorDescCreatorTypes::ncsp, precision}}, + {{TensorDescCreatorTypes::ncsp, precision}}, + impl_desc_type::ref_any); +} + +void MKLDNNGatherTreeNode::execute(mkldnn::stream strm) { + if (precision == Precision::FP32) + return gatherTreeKernel(); + else + return gatherTreeKernel(); +} + +template +void MKLDNNGatherTreeNode::gatherTreeKernel() noexcept { + const auto *step_idx = reinterpret_cast(getParentEdgeAt(GATHER_TREE_STEP_IDX)->getMemoryPtr()->GetPtr()); + const auto * const parent_idx = reinterpret_cast(getParentEdgeAt(GATHER_TREE_PARENT_IDX)->getMemoryPtr()->GetPtr()); + const size_t parent_idx_size = getParentEdgeAt(GATHER_TREE_PARENT_IDX)->getDims().size() + - getParentEdgeAt(GATHER_TREE_PARENT_IDX)->getDesc().getBlockingDesc().getOffsetPadding(); + const auto *max_seq_len = reinterpret_cast(getParentEdgeAt(GATHER_TREE_MAX_SEQ_LEN)->getMemoryPtr()->GetPtr()); + auto end_token = (reinterpret_cast(getParentEdgeAt(GATHER_TREE_END_TOKEN)->getMemoryPtr()->GetPtr()))[0]; + auto * final_idx = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + + SizeVector step_idx_dims = getParentEdgeAt(GATHER_TREE_STEP_IDX)->getDims().ToSizeVector(); + SizeVector parent_idx_dims = getParentEdgeAt(GATHER_TREE_PARENT_IDX)->getDims().ToSizeVector(); + SizeVector max_seq_len_dims = getParentEdgeAt(GATHER_TREE_MAX_SEQ_LEN)->getDims().ToSizeVector(); + SizeVector final_idx_dims = getChildEdgesAtPort(0)[0]->getDims().ToSizeVector(); + int32_t max_time = step_idx_dims[0]; + const size_t batch_size = step_idx_dims[1]; + const size_t beam_width = step_idx_dims[2]; + const size_t bb_size = batch_size * beam_width; + + if (max_time != static_cast(parent_idx_dims[0]) || max_time != static_cast(final_idx_dims[0]) || + batch_size != parent_idx_dims[1] || batch_size != final_idx_dims[1] || batch_size != max_seq_len_dims[0] || + beam_width != parent_idx_dims[2] || beam_width != final_idx_dims[2]) { + std::string errorMsg = "Input/Output tensors dimensions mismatch"; + IE_THROW() << errorMsg; + } + + bool incorrect_result = false; + parallel_for2d(batch_size, beam_width, [&](size_t batch, size_t beam) { + int32_t max_sequence_in_beam = std::min(max_time, static_cast(max_seq_len[batch])); + if (max_sequence_in_beam > 0) { + int32_t time, idx = (max_time - 1) * bb_size + batch * beam_width; + for (time = (max_time - 1); time >= max_sequence_in_beam; time--, idx -= bb_size) + final_idx[idx + beam] = end_token; + + for (int32_t parent = static_cast(beam); time >= 0; time--, idx -= bb_size) { + if (parent < 0 || parent >= static_cast(beam_width) || idx + parent >= parent_idx_size) { + incorrect_result = true; + break; + } + final_idx[idx + beam] = step_idx[idx + parent]; + parent = static_cast(parent_idx[idx + parent]); + } + + bool finished = false; + auto *final = &final_idx[batch * beam_width + beam]; + for (time = 0; time < max_sequence_in_beam; time++, final += bb_size) { + if (finished) + (*final) = end_token; + else if ((*final) == end_token) + finished = true; + } + } + }); + + if (incorrect_result) { + std::string errorMsg = "Wrong parent index, result is incorrect"; + IE_THROW() << errorMsg; + } +} + +bool MKLDNNGatherTreeNode::created() const { + return getType() == GatherTree; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNGatherTreeNode, GatherTree) diff --git 
a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_tree_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_tree_node.h new file mode 100644 index 00000000000000..63f34fe6d6e685 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gather_tree_node.h @@ -0,0 +1,38 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNGatherTreeNode : public MKLDNNNode { +public: + MKLDNNGatherTreeNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + + template + void gatherTreeKernel() noexcept; + + private: + static const size_t GATHER_TREE_STEP_IDX = 0; + static const size_t GATHER_TREE_PARENT_IDX = 1; + static const size_t GATHER_TREE_MAX_SEQ_LEN = 2; + static const size_t GATHER_TREE_END_TOKEN = 3; + + InferenceEngine::Precision precision; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_grn_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_grn_node.cpp new file mode 100644 index 00000000000000..0dbe8dee59ea51 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_grn_node.cpp @@ -0,0 +1,81 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "base.hpp" + +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_grn_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNGRNNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto grn = std::dynamic_pointer_cast(op); + if (!grn) { + errorMessage = "Only opset1 GRN operation is supported"; + return false; + } + } catch (...) { + return false; + } + return true; +} + +MKLDNNGRNNode::MKLDNNGRNNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "GRN layer with name '" + op->get_friendly_name() + "'"; + const auto grn = std::dynamic_pointer_cast(op); + + if (getOriginalInputsNumber() != 1 || getOriginalOutputsNumber() != 1) + IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; + + bias = grn->get_bias(); +} + +void MKLDNNGRNNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, Precision::FP32, false, 0}}, + {{TensorDescCreatorTypes::ncsp, Precision::FP32, false, 0}}, + impl_desc_type::ref_any); +} + +void MKLDNNGRNNode::execute(mkldnn::stream strm) { + const float* src_data = reinterpret_cast(getParentEdgeAt(0)->getMemoryPtr()->GetPtr()); + float* dst_data = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + + SizeVector dims = getParentEdgeAt(0)->getDims().ToSizeVector(); + + int N = static_cast((dims.size() > 0) ? dims[0] : 1); + int C = static_cast((dims.size() > 1) ? dims[1] : 1); + int H = static_cast((dims.size() > 2) ? 
dims[2] : 1); + int W = static_cast((dims.size() > 3) ? dims[3] : 1); + + parallel_for3d(N, H, W, [&](int b, int h, int w) { + double variance = 0; + for (int c = 0; c < C; c++) { + variance += std::pow(src_data[b*C*H*W + c*H*W + h*W + w], 2); + } + variance = std::pow(variance + bias, 0.5f); + for (int c = 0; c < C; c++) { + dst_data[b*C*H*W + c*H*W + h*W + w] = src_data[b*C*H*W + c*H*W + h*W + w] / static_cast(variance); + } + }); +} + +bool MKLDNNGRNNode::created() const { + return getType() == GRN; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNGRNNode, GRN) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_grn_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_grn_node.h new file mode 100644 index 00000000000000..8fe8d9d75b04e7 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_grn_node.h @@ -0,0 +1,30 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNGRNNode : public MKLDNNNode { +public: + MKLDNNGRNNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + float bias = 1.0f; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_log_softmax_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_log_softmax_node.cpp new file mode 100644 index 00000000000000..5750f8517b0096 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_log_softmax_node.cpp @@ -0,0 +1,116 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_log_softmax_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNLogSoftmaxNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto logSoftMax = std::dynamic_pointer_cast(op); + if (!logSoftMax) { + errorMessage = "Only opset5 LogSoftmax operation is supported"; + return false; + } + } catch (...) 
{ + return false; + } + return true; +} + +MKLDNNLogSoftmaxNode::MKLDNNLogSoftmaxNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "LogSoftmax layer with name '" + op->get_friendly_name() + "'"; + const auto logSoftMax = std::dynamic_pointer_cast(op); + + if (getOriginalInputsNumber() != 1 || getOriginalOutputsNumber() != 1) + IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; + + SizeVector dims = op->get_input_shape(0); + if (!dims.size()) + dims = SizeVector(1, 1); + int axis = logSoftMax->get_axis(); + if (axis < 0) + axis += dims.size(); + + if (dims.size() < static_cast((size_t)(1) + axis)) + IE_THROW() << errorPrefix << " has incorrect input parameters dimensions and axis number!"; + + int j; + for (j = dims.size() - 1; j >= 0; j--) { + if (dims[j] != 1) break; + } + if (j == axis) isLastDim = true; + + for (int i = 0; i < axis; i++) + axisStep *= dims[i]; + reducedAxisSize = dims[axis]; + for (size_t i = (axis + 1); i < dims.size(); i++) + reducedAxisStride *= dims[i]; +} + +void MKLDNNLogSoftmaxNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); +} + +void MKLDNNLogSoftmaxNode::execute(mkldnn::stream strm) { + const float *srcData = reinterpret_cast(getParentEdgeAt(0)->getMemoryPtr()->GetPtr()); + float* dstData = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + + if (isLastDim) { + parallel_for(axisStep, [&](size_t i) { + const float *srcDataPtr = &srcData[i * reducedAxisSize]; + float *dstDataPtr = &dstData[i * reducedAxisSize]; + + float reduceProd = 0.0f; + const float max = *std::max_element(srcDataPtr, srcDataPtr + reducedAxisSize); + for (size_t j = 0; j < reducedAxisSize; ++j) + reduceProd += expf(srcDataPtr[j] - max); + + reduceProd = logf(reduceProd); + for (size_t j = 0; j < reducedAxisSize; ++j) + dstDataPtr[j] = srcDataPtr[j] - max - reduceProd; + }); + } else { + parallel_for2d(axisStep, reducedAxisStride, [&](size_t k, size_t i) { + const float *srcDataPtr = &srcData[k * reducedAxisStride * reducedAxisSize + i]; + float *dstDataPtr = &dstData[k * reducedAxisStride * reducedAxisSize + i]; + + float reduceProd = 0.0f; + float max = std::numeric_limits::min(); + for (size_t j = 0; j < reducedAxisSize; ++j) { + if (srcDataPtr[j * reducedAxisStride] > max) + max = srcDataPtr[j * reducedAxisStride]; + } + + for (size_t j = 0; j < reducedAxisSize; ++j) + reduceProd += expf(srcDataPtr[j * reducedAxisStride] - max); + + reduceProd = logf(reduceProd); + for (size_t j = 0; j < reducedAxisSize; ++j) + dstDataPtr[j * reducedAxisStride] = srcDataPtr[j * reducedAxisStride] - max - reduceProd; + }); + } +} + +bool MKLDNNLogSoftmaxNode::created() const { + return getType() == LogSoftmax; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNLogSoftmaxNode, LogSoftmax) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_log_softmax_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_log_softmax_node.h new file mode 100644 index 00000000000000..456d7321efcdc4 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_log_softmax_node.h @@ -0,0 +1,34 @@ +// Copyright (C) 2018-2021 Intel Corporation +// 
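Editor's note: the LogSoftmax execute path above computes the numerically stable form y_j = x_j - max(x) - log(sum_k exp(x_k - max(x))). A self-contained sketch of the same formula on an arbitrary vector, for illustration only:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
    std::vector<float> x = {1.f, 2.f, 3.f};
    std::vector<float> y(x.size());

    const float mx = *std::max_element(x.begin(), x.end());
    float sum = 0.f;
    for (float v : x)
        sum += std::exp(v - mx);      // subtracting the max keeps exp() from overflowing
    const float log_sum = std::log(sum);
    for (size_t j = 0; j < x.size(); ++j)
        y[j] = x[j] - mx - log_sum;

    for (float v : y) std::cout << v << ' ';  // exp(y) sums to 1
    std::cout << '\n';
}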
SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNLogSoftmaxNode : public MKLDNNNode { +public: + MKLDNNLogSoftmaxNode(const std::shared_ptr& op, + const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + size_t reducedAxisSize; + size_t reducedAxisStride = 1; + size_t axisStep = 1; + bool isLastDim = false; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_math_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_math_node.cpp index ecfa4fbbd32468..908686bf6df1eb 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_math_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_math_node.cpp @@ -43,17 +43,17 @@ MKLDNNMathNode::MKLDNNMathNode(const std::shared_ptr& op, const mk } initializers[op->get_type_info()](op, *this); - - size_t sizeVector = op->get_input_size(); - inDataConf.reserve(sizeVector); - for (int i = 0; i < sizeVector; ++i) - inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::FP32); } void MKLDNNMathNode::initSupportedPrimitiveDescriptors() { if (!supportedPrimitiveDescriptors.empty()) return; + std::vector inDataConf; + inDataConf.reserve(getOriginalInputsNumber()); + for (int i = 0; i < getOriginalInputsNumber(); ++i) + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::FP32); + addSupportedPrimDesc(inDataConf, {{TensorDescCreatorTypes::ncsp, Precision::FP32}}, impl_desc_type::ref_any); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_math_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_math_node.h index a91cb3ae373d9c..28260dc476ec54 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_math_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_math_node.h @@ -28,7 +28,6 @@ class MKLDNNMathNode : public MKLDNNNode { float beta = 0.0f; float gamma = 0.0f; - std::vector inDataConf; std::string errorPrefix; }; diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.cpp index f27a40e3bd255a..baff79e5d75317 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.cpp @@ -306,8 +306,8 @@ struct jit_uni_mvn_mean_variance_kernel_f32 : public jit_uni_mvn_mean_variance_k inline void worker_tail_planar() { Precision dst_prc = isFloatCompatible(jcp_.src_prc) ? 
Precision::FP32 : Precision::I32; load_emitter->emit_code({static_cast(reg_src.getIdx())}, {static_cast(vmm_val.getIdx())}, - std::make_shared(jcp_.src_prc, dst_prc, tail_num, true, "zero"), - {}, {load_pool_gpr_idxs}); + std::make_shared(jcp_.src_prc, dst_prc, tail_num, 0, true), + {}, {load_pool_gpr_idxs}); if (jcp_.normalize_variance) { if (!isFloatCompatible(jcp_.src_prc)) @@ -477,8 +477,7 @@ struct jit_uni_mvn_kernel_f32 : public jit_uni_mvn_kernel, public jit_generator this->postamble(); load_emitter->emit_data(); - if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core) && store_emitter != nullptr && store_emitter->get_emu_vcvtneps2bf16() != nullptr) - store_emitter->get_emu_vcvtneps2bf16()->emit_data(); + store_emitter->emit_data(); for (auto& inj : eltwise_injectors) inj->prepare_table(); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_non_max_suppression_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_non_max_suppression_node.cpp new file mode 100644 index 00000000000000..093127eada5f9a --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_non_max_suppression_node.cpp @@ -0,0 +1,406 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include +#include +#include + +#include "mkldnn_non_max_suppression_node.h" +#include "ie_parallel.hpp" +#include +#include "utils/general_utils.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNNonMaxSuppressionNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto nms = std::dynamic_pointer_cast(op); + if (!nms) { + errorMessage = "Only internal NonMaxSuppression operation is supported"; + return false; + } + } catch (...) { + return false; + } + return true; +} + +MKLDNNNonMaxSuppressionNode::MKLDNNNonMaxSuppressionNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "NMS layer with name '" + op->get_friendly_name() + "' "; + const auto nms = std::dynamic_pointer_cast(op); + + if (getOriginalInputsNumber() < 2 || getOriginalInputsNumber() > 6) + IE_THROW() << errorPrefix << "has incorrect number of input edges: " << getOriginalInputsNumber(); + + if (getOriginalOutputsNumber() < 1 || getOriginalOutputsNumber() > 3) + IE_THROW() << errorPrefix << "has incorrect number of output edges: " << getOriginalOutputsNumber(); + + boxEncodingType = nms->m_center_point_box ? 
boxEncoding::CENTER : boxEncoding::CORNER; + + sort_result_descending = nms->m_sort_result_descending; + + const SizeVector &boxes_dims = op->get_input_shape(NMS_BOXES); + num_batches = boxes_dims[0]; + num_boxes = boxes_dims[1]; + if (boxes_dims.size() != 3) + IE_THROW() << errorPrefix << "has unsupported 'boxes' input rank: " << boxes_dims.size(); + if (boxes_dims[2] != 4) + IE_THROW() << errorPrefix << "has unsupported 'boxes' input 3rd dimension size: " << boxes_dims[2]; + + const SizeVector &scores_dims = op->get_input_shape(NMS_SCORES); + num_classes = scores_dims[1]; + if (scores_dims.size() != 3) + IE_THROW() << errorPrefix << "has unsupported 'scores' input rank: " << scores_dims.size(); + + if (num_batches != scores_dims[0]) + IE_THROW() << errorPrefix << " num_batches is different in 'boxes' and 'scores' inputs"; + if (num_boxes != scores_dims[2]) + IE_THROW() << errorPrefix << " num_boxes is different in 'boxes' and 'scores' inputs"; + + numFiltBox.resize(num_batches); + for (auto & i : numFiltBox) + i.resize(num_classes); + + inputShape_MAXOUTPUTBOXESPERCLASS = op->get_input_shape(NMS_MAXOUTPUTBOXESPERCLASS); + inputShape_IOUTHRESHOLD = op->get_input_shape(NMS_IOUTHRESHOLD); + inputShape_SCORETHRESHOLD = op->get_input_shape(NMS_SCORETHRESHOLD); + if (getOriginalInputsNumber() > NMS_SOFTNMSSIGMA) { + inputShape_SOFTNMSSIGMA = op->get_input_shape(NMS_SOFTNMSSIGMA); + } + + outputShape_SELECTEDINDICES = op->get_output_shape(NMS_SELECTEDINDICES); + outputShape_SELECTEDSCORES = op->get_output_shape(NMS_SELECTEDSCORES); + + const SizeVector &valid_outputs_dims = op->get_input_shape(NMS_VALIDOUTPUTS); + if (valid_outputs_dims.size() != 1) + IE_THROW() << errorPrefix << "has unsupported 'valid_outputs' output rank: " << valid_outputs_dims.size(); + if (valid_outputs_dims[0] != 1) + IE_THROW() << errorPrefix << "has unsupported 'valid_outputs' output 1st dimension size: " << valid_outputs_dims[1]; +} + +void MKLDNNNonMaxSuppressionNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + const std::vector supportedFloatPrecision = {Precision::FP32, Precision::BF16}; + const std::vector supportedIntOutputPrecision = {Precision::I32, Precision::I64}; + + checkPrecision(getOriginalInputPrecisionAtPort(NMS_BOXES), supportedFloatPrecision, "boxes", inType); + checkPrecision(getOriginalInputPrecisionAtPort(NMS_SCORES), supportedFloatPrecision, "scores", inType); + checkPrecision(getOriginalInputPrecisionAtPort(NMS_VALIDOUTPUTS), supportedIntOutputPrecision, "valid_outputs", outType); + + const std::vector supportedPrecision = {Precision::I16, Precision::U8, Precision::I8, Precision::U16, Precision::I32, + Precision::U32, Precision::I64, Precision::U64}; + + check1DInput(inputShape_MAXOUTPUTBOXESPERCLASS, supportedPrecision, "max_output_boxes_per_class", NMS_MAXOUTPUTBOXESPERCLASS); + check1DInput(inputShape_IOUTHRESHOLD, supportedFloatPrecision, "iou_threshold", NMS_IOUTHRESHOLD); + check1DInput(inputShape_SCORETHRESHOLD, supportedFloatPrecision, "score_threshold", NMS_SCORETHRESHOLD); + + if (getOriginalInputsNumber() > NMS_SOFTNMSSIGMA) { + check1DInput(inputShape_SOFTNMSSIGMA, supportedFloatPrecision, "soft_nms_sigma", NMS_SOFTNMSSIGMA); + } + + checkOutput(outputShape_SELECTEDINDICES, supportedIntOutputPrecision, "selected_indices", NMS_SELECTEDINDICES); + checkOutput(outputShape_SELECTEDSCORES, supportedFloatPrecision, "selected_scores", NMS_SELECTEDSCORES); + + std::vector inDataConf; + inDataConf.reserve(getOriginalInputsNumber()); + for 
(int i = 0; i < getOriginalInputsNumber(); ++i) { + Precision inPrecision = i == NMS_MAXOUTPUTBOXESPERCLASS ? Precision::I32 : Precision::FP32; + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, inPrecision); + } + + std::vector outDataConf; + outDataConf.reserve(getOriginalOutputsNumber()); + for (int i = 0; i < getOriginalOutputsNumber(); ++i) { + Precision outPrecision = i == NMS_SELECTEDSCORES ? Precision::FP32 : Precision::I32; + outDataConf.emplace_back(TensorDescCreatorTypes::ncsp, outPrecision); + } + + addSupportedPrimDesc(inDataConf, outDataConf, impl_desc_type::ref_any); +} + +void MKLDNNNonMaxSuppressionNode::execute(mkldnn::stream strm) { + const float *boxes = reinterpret_cast(getParentEdgeAt(NMS_BOXES)->getMemoryPtr()->GetPtr()); + const float *scores = reinterpret_cast(getParentEdgeAt(NMS_SCORES)->getMemoryPtr()->GetPtr()); + + max_output_boxes_per_class = outDims.size() > NMS_SELECTEDSCORES ? 0 : num_boxes; + if (inDims.size() > NMS_MAXOUTPUTBOXESPERCLASS) { + max_output_boxes_per_class = reinterpret_cast(getParentEdgeAt(NMS_MAXOUTPUTBOXESPERCLASS)->getMemoryPtr()->GetPtr())[0]; + } + + if (max_output_boxes_per_class == 0) + return; + + iou_threshold = outDims.size() > NMS_SELECTEDSCORES ? 0.0f : 1.0f; + if (inDims.size() > NMS_IOUTHRESHOLD) + iou_threshold = reinterpret_cast(getParentEdgeAt(NMS_IOUTHRESHOLD)->getMemoryPtr()->GetPtr())[0]; + + score_threshold = 0.0f; + if (inDims.size() > NMS_SCORETHRESHOLD) + score_threshold = reinterpret_cast(getParentEdgeAt(NMS_SCORETHRESHOLD)->getMemoryPtr()->GetPtr())[0]; + + soft_nms_sigma = 0.0f; + if (inDims.size() > NMS_SOFTNMSSIGMA) + soft_nms_sigma = reinterpret_cast(getParentEdgeAt(NMS_SOFTNMSSIGMA)->getMemoryPtr()->GetPtr())[0]; + scale = 0.0f; + if (soft_nms_sigma > 0.0) { + scale = -0.5 / soft_nms_sigma; + } + + int *selected_indices = reinterpret_cast(getChildEdgesAtPort(NMS_SELECTEDINDICES)[0]->getMemoryPtr()->GetPtr()); + + float *selected_scores = nullptr; + if (outDims.size() > NMS_SELECTEDSCORES) + selected_scores = reinterpret_cast(getChildEdgesAtPort(NMS_SELECTEDSCORES)[0]->getMemoryPtr()->GetPtr()); + + int *valid_outputs = nullptr; + if (outDims.size() > NMS_VALIDOUTPUTS) + valid_outputs = reinterpret_cast(getChildEdgesAtPort(NMS_VALIDOUTPUTS)[0]->getMemoryPtr()->GetPtr()); + + auto boxesStrides = getParentEdgeAt(NMS_BOXES)->getDesc().getBlockingDesc().getStrides(); + auto scoresStrides = getParentEdgeAt(NMS_SCORES)->getDesc().getBlockingDesc().getStrides(); + + std::vector filtBoxes(max_output_boxes_per_class * num_batches * num_classes); + + if (soft_nms_sigma == 0.0f) { + nmsWithoutSoftSigma(boxes, scores, boxesStrides, scoresStrides, filtBoxes); + } else { + nmsWithSoftSigma(boxes, scores, boxesStrides, scoresStrides, filtBoxes); + } + + size_t startOffset = numFiltBox[0][0]; + for (size_t b = 0; b < numFiltBox.size(); b++) { + size_t batchOffset = b*num_classes*max_output_boxes_per_class; + for (size_t c = (b == 0 ? 
1 : 0); c < numFiltBox[b].size(); c++) { + size_t offset = batchOffset + c*max_output_boxes_per_class; + for (size_t i = 0; i < numFiltBox[b][c]; i++) { + filtBoxes[startOffset + i] = filtBoxes[offset + i]; + } + startOffset += numFiltBox[b][c]; + } + } + filtBoxes.resize(startOffset); + + // need more particular comparator to get deterministic behaviour + // escape situation when filtred boxes with same score have different position from launch to launch + if (sort_result_descending) { + parallel_sort(filtBoxes.begin(), filtBoxes.end(), + [](const filteredBoxes& l, const filteredBoxes& r) { + return (l.score > r.score) || + (l.score == r.score && l.batch_index < r.batch_index) || + (l.score == r.score && l.batch_index == r.batch_index && l.class_index < r.class_index) || + (l.score == r.score && l.batch_index == r.batch_index && l.class_index == r.class_index && l.box_index < r.box_index); + }); + } + + const size_t selectedBoxesNum = getChildEdgesAtPort(NMS_SELECTEDINDICES)[0]->getDims()[0]; + const size_t validOutputs = std::min(filtBoxes.size(), selectedBoxesNum); + + int selectedIndicesStride = getChildEdgesAtPort(NMS_SELECTEDINDICES)[0]->getDesc().getBlockingDesc().getStrides()[0]; + int *selectedIndicesPtr = selected_indices; + float *selectedScoresPtr = selected_scores; + + size_t idx = 0lu; + for (; idx < validOutputs; idx++) { + selectedIndicesPtr[0] = filtBoxes[idx].batch_index; + selectedIndicesPtr[1] = filtBoxes[idx].class_index; + selectedIndicesPtr[2] = filtBoxes[idx].box_index; + selectedIndicesPtr += selectedIndicesStride; + if (outDims.size() > NMS_SELECTEDSCORES) { + selectedScoresPtr[0] = static_cast(filtBoxes[idx].batch_index); + selectedScoresPtr[1] = static_cast(filtBoxes[idx].class_index); + selectedScoresPtr[2] = static_cast(filtBoxes[idx].score); + selectedScoresPtr += selectedIndicesStride; + } + } + std::fill(selectedIndicesPtr, selectedIndicesPtr + (selectedBoxesNum - idx) * selectedIndicesStride, -1); + if (outDims.size() > NMS_SELECTEDSCORES) { + std::fill(selectedScoresPtr, selectedScoresPtr + (selectedBoxesNum - idx) * selectedIndicesStride, -1.f); + } + if (outDims.size() > NMS_VALIDOUTPUTS) + *valid_outputs = static_cast(validOutputs); +} + +bool MKLDNNNonMaxSuppressionNode::created() const { + return getType() == NonMaxSuppression; +} + +float MKLDNNNonMaxSuppressionNode::intersectionOverUnion(const float *boxesI, const float *boxesJ) { + float yminI, xminI, ymaxI, xmaxI, yminJ, xminJ, ymaxJ, xmaxJ; + if (boxEncodingType == boxEncoding::CENTER) { + // box format: x_center, y_center, width, height + yminI = boxesI[1] - boxesI[3] / 2.f; + xminI = boxesI[0] - boxesI[2] / 2.f; + ymaxI = boxesI[1] + boxesI[3] / 2.f; + xmaxI = boxesI[0] + boxesI[2] / 2.f; + yminJ = boxesJ[1] - boxesJ[3] / 2.f; + xminJ = boxesJ[0] - boxesJ[2] / 2.f; + ymaxJ = boxesJ[1] + boxesJ[3] / 2.f; + xmaxJ = boxesJ[0] + boxesJ[2] / 2.f; + } else { + // box format: y1, x1, y2, x2 + yminI = (std::min)(boxesI[0], boxesI[2]); + xminI = (std::min)(boxesI[1], boxesI[3]); + ymaxI = (std::max)(boxesI[0], boxesI[2]); + xmaxI = (std::max)(boxesI[1], boxesI[3]); + yminJ = (std::min)(boxesJ[0], boxesJ[2]); + xminJ = (std::min)(boxesJ[1], boxesJ[3]); + ymaxJ = (std::max)(boxesJ[0], boxesJ[2]); + xmaxJ = (std::max)(boxesJ[1], boxesJ[3]); + } + + float areaI = (ymaxI - yminI) * (xmaxI - xminI); + float areaJ = (ymaxJ - yminJ) * (xmaxJ - xminJ); + if (areaI <= 0.f || areaJ <= 0.f) + return 0.f; + + float intersection_area = + (std::max)((std::min)(ymaxI, ymaxJ) - (std::max)(yminI, yminJ), 0.f) * + 
(std::max)((std::min)(xmaxI, xmaxJ) - (std::max)(xminI, xminJ), 0.f); + return intersection_area / (areaI + areaJ - intersection_area); +} + +void MKLDNNNonMaxSuppressionNode::nmsWithSoftSigma(const float *boxes, const float *scores, const SizeVector &boxesStrides, + const SizeVector &scoresStrides, std::vector &filtBoxes) { + auto less = [](const boxInfo& l, const boxInfo& r) { + return l.score < r.score || ((l.score == r.score) && (l.idx > r.idx)); + }; + + auto coeff = [&](float iou) { + const float weight = std::exp(scale * iou * iou); + return iou <= iou_threshold ? weight : 0.0f; + }; + + parallel_for2d(num_batches, num_classes, [&](int batch_idx, int class_idx) { + std::vector fb; + const float *boxesPtr = boxes + batch_idx * boxesStrides[0]; + const float *scoresPtr = scores + batch_idx * scoresStrides[0] + class_idx * scoresStrides[1]; + + std::priority_queue, decltype(less)> sorted_boxes(less); + for (int box_idx = 0; box_idx < num_boxes; box_idx++) { + if (scoresPtr[box_idx] > score_threshold) + sorted_boxes.emplace(boxInfo({scoresPtr[box_idx], box_idx, 0})); + } + + fb.reserve(sorted_boxes.size()); + if (sorted_boxes.size() > 0) { + while (fb.size() < max_output_boxes_per_class && !sorted_boxes.empty()) { + boxInfo currBox = sorted_boxes.top(); + float origScore = currBox.score; + sorted_boxes.pop(); + + bool box_is_selected = true; + for (int idx = static_cast(fb.size()) - 1; idx >= currBox.suppress_begin_index; idx--) { + float iou = intersectionOverUnion(&boxesPtr[currBox.idx * 4], &boxesPtr[fb[idx].box_index * 4]); + currBox.score *= coeff(iou); + if (iou >= iou_threshold) { + box_is_selected = false; + break; + } + if (currBox.score <= score_threshold) + break; + } + + currBox.suppress_begin_index = fb.size(); + if (box_is_selected) { + if (currBox.score == origScore) { + fb.push_back({ currBox.score, batch_idx, class_idx, currBox.idx }); + continue; + } + if (currBox.score > score_threshold) { + sorted_boxes.push(currBox); + } + } + } + } + numFiltBox[batch_idx][class_idx] = fb.size(); + size_t offset = batch_idx*num_classes*max_output_boxes_per_class + class_idx*max_output_boxes_per_class; + for (size_t i = 0; i < fb.size(); i++) { + filtBoxes[offset + i] = fb[i]; + } + }); +} + +void MKLDNNNonMaxSuppressionNode::nmsWithoutSoftSigma(const float *boxes, const float *scores, const SizeVector &boxesStrides, + const SizeVector &scoresStrides, std::vector &filtBoxes) { + int max_out_box = static_cast(max_output_boxes_per_class); + parallel_for2d(num_batches, num_classes, [&](int batch_idx, int class_idx) { + const float *boxesPtr = boxes + batch_idx * boxesStrides[0]; + const float *scoresPtr = scores + batch_idx * scoresStrides[0] + class_idx * scoresStrides[1]; + + std::vector> sorted_boxes; + for (int box_idx = 0; box_idx < num_boxes; box_idx++) { + if (scoresPtr[box_idx] > score_threshold) + sorted_boxes.emplace_back(std::make_pair(scoresPtr[box_idx], box_idx)); + } + + int io_selection_size = 0; + if (sorted_boxes.size() > 0) { + parallel_sort(sorted_boxes.begin(), sorted_boxes.end(), + [](const std::pair& l, const std::pair& r) { + return (l.first > r.first || ((l.first == r.first) && (l.second < r.second))); + }); + int offset = batch_idx*num_classes*max_output_boxes_per_class + class_idx*max_output_boxes_per_class; + filtBoxes[offset + 0] = filteredBoxes(sorted_boxes[0].first, batch_idx, class_idx, sorted_boxes[0].second); + io_selection_size++; + for (size_t box_idx = 1; (box_idx < sorted_boxes.size()) && (io_selection_size < max_out_box); box_idx++) { + bool 
box_is_selected = true; + for (int idx = io_selection_size - 1; idx >= 0; idx--) { + float iou = intersectionOverUnion(&boxesPtr[sorted_boxes[box_idx].second * 4], &boxesPtr[filtBoxes[offset + idx].box_index * 4]); + if (iou >= iou_threshold) { + box_is_selected = false; + break; + } + } + + if (box_is_selected) { + filtBoxes[offset + io_selection_size] = filteredBoxes(sorted_boxes[box_idx].first, batch_idx, class_idx, sorted_boxes[box_idx].second); + io_selection_size++; + } + } + } + numFiltBox[batch_idx][class_idx] = io_selection_size; + }); +} + +void MKLDNNNonMaxSuppressionNode::checkPrecision(const Precision prec, const std::vector precList, + const std::string name, const std::string type) { + if (std::find(precList.begin(), precList.end(), prec) == precList.end()) + IE_THROW() << errorPrefix << "has unsupported '" << name << "' " << type << " precision: " << prec; +} + +void MKLDNNNonMaxSuppressionNode::check1DInput(const SizeVector& dims, const std::vector precList, + const std::string name, const size_t port) { + checkPrecision(getOriginalInputPrecisionAtPort(port), precList, name, inType); + + if (dims.size() != 0 && dims.size() != 1) + IE_THROW() << errorPrefix << "has unsupported '" << name << "' input rank: " << dims.size(); + if (dims.size() == 1) + if (dims[0] != 1) + IE_THROW() << errorPrefix << "has unsupported '" << name << "' input 1st dimension size: " << dims[0]; +} + +void MKLDNNNonMaxSuppressionNode::checkOutput(const SizeVector& dims, const std::vector precList, + const std::string name, const size_t port) { + checkPrecision(getOriginalOutputPrecisionAtPort(port), precList, name, outType); + + if (dims.size() != 2) + IE_THROW() << errorPrefix << "has unsupported '" << name << "' output rank: " << dims.size(); + if (dims[1] != 3) + IE_THROW() << errorPrefix << "has unsupported '" << name << "' output 2nd dimension size: " << dims[1]; +} + + +REG_MKLDNN_PRIM_FOR(MKLDNNNonMaxSuppressionNode, NonMaxSuppression) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_non_max_suppression_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_non_max_suppression_node.h new file mode 100644 index 00000000000000..4651da1f2e795c --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_non_max_suppression_node.h @@ -0,0 +1,102 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include + +using namespace InferenceEngine; + +namespace MKLDNNPlugin { + +class MKLDNNNonMaxSuppressionNode : public MKLDNNNode { +public: + MKLDNNNonMaxSuppressionNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + + struct filteredBoxes { + float score; + int batch_index; + int class_index; + int box_index; + filteredBoxes() = default; + filteredBoxes(float _score, int _batch_index, int _class_index, int _box_index) : + score(_score), batch_index(_batch_index), class_index(_class_index), box_index(_box_index) {} + }; + + struct boxInfo { + float score; + int idx; + int suppress_begin_index; + }; + + float intersectionOverUnion(const float *boxesI, const float *boxesJ); + + void nmsWithSoftSigma(const float *boxes, const float 
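Editor's note: the selection logic above reduces to a corner-format IoU plus a greedy keep/suppress pass per (batch, class); the soft-NMS path additionally rescales scores by exp(scale * iou^2). Below is a compact standalone sketch of the hard-NMS variant with invented boxes and thresholds (not the plugin API):

#include <algorithm>
#include <array>
#include <cstddef>
#include <iostream>
#include <vector>

static float iou(const std::array<float, 4> &a, const std::array<float, 4> &b) {
    // boxes as (y1, x1, y2, x2); min/max make the corner order irrelevant
    const float ay1 = std::min(a[0], a[2]), ax1 = std::min(a[1], a[3]);
    const float ay2 = std::max(a[0], a[2]), ax2 = std::max(a[1], a[3]);
    const float by1 = std::min(b[0], b[2]), bx1 = std::min(b[1], b[3]);
    const float by2 = std::max(b[0], b[2]), bx2 = std::max(b[1], b[3]);
    const float areaA = (ay2 - ay1) * (ax2 - ax1);
    const float areaB = (by2 - by1) * (bx2 - bx1);
    if (areaA <= 0.f || areaB <= 0.f) return 0.f;
    const float ih = std::max(std::min(ay2, by2) - std::max(ay1, by1), 0.f);
    const float iw = std::max(std::min(ax2, bx2) - std::max(ax1, bx1), 0.f);
    return ih * iw / (areaA + areaB - ih * iw);
}

int main() {
    std::vector<std::array<float, 4>> boxes = {
        {0, 0, 10, 10}, {1, 1, 11, 11}, {20, 20, 30, 30}};
    std::vector<float> scores = {0.9f, 0.8f, 0.7f};
    const float iou_threshold = 0.5f;
    const size_t max_out = 10;

    // candidates sorted by score, highest first
    std::vector<int> order = {0, 1, 2};
    std::sort(order.begin(), order.end(),
              [&](int l, int r) { return scores[l] > scores[r]; });

    std::vector<int> kept;
    for (int cand : order) {
        bool selected = true;
        for (int k : kept)
            if (iou(boxes[cand], boxes[k]) >= iou_threshold) { selected = false; break; }
        if (selected) kept.push_back(cand);
        if (kept.size() >= max_out) break;
    }
    for (int k : kept) std::cout << k << ' ';  // expected: 0 2 (box 1 overlaps box 0)
    std::cout << '\n';
}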
*scores, const SizeVector &boxesStrides, + const SizeVector &scoresStrides, std::vector &filtBoxes); + + void nmsWithoutSoftSigma(const float *boxes, const float *scores, const SizeVector &boxesStrides, + const SizeVector &scoresStrides, std::vector &filtBoxes); + +private: + // input + const size_t NMS_BOXES = 0; + const size_t NMS_SCORES = 1; + const size_t NMS_MAXOUTPUTBOXESPERCLASS = 2; + const size_t NMS_IOUTHRESHOLD = 3; + const size_t NMS_SCORETHRESHOLD = 4; + const size_t NMS_SOFTNMSSIGMA = 5; + + // output + const size_t NMS_SELECTEDINDICES = 0; + const size_t NMS_SELECTEDSCORES = 1; + const size_t NMS_VALIDOUTPUTS = 2; + + enum class boxEncoding { + CORNER, + CENTER + }; + boxEncoding boxEncodingType = boxEncoding::CORNER; + bool sort_result_descending = true; + + size_t num_batches; + size_t num_boxes; + size_t num_classes; + + size_t max_output_boxes_per_class = 0lu; + float iou_threshold = 0.0f; + float score_threshold = 0.0f; + float soft_nms_sigma = 0.0f; + float scale = 1.f; + + SizeVector inputShape_MAXOUTPUTBOXESPERCLASS; + SizeVector inputShape_IOUTHRESHOLD; + SizeVector inputShape_SCORETHRESHOLD; + SizeVector inputShape_SOFTNMSSIGMA; + + SizeVector outputShape_SELECTEDINDICES; + SizeVector outputShape_SELECTEDSCORES; + + std::string errorPrefix; + + std::vector> numFiltBox; + const std::string inType = "input", outType = "output"; + + void checkPrecision(const Precision prec, const std::vector precList, const std::string name, const std::string type); + void check1DInput(const SizeVector& dims, const std::vector precList, const std::string name, const size_t port); + void checkOutput(const SizeVector& dims, const std::vector precList, const std::string name, const size_t port); +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_proposal_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_proposal_node.cpp new file mode 100644 index 00000000000000..584960373aeb2e --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_proposal_node.cpp @@ -0,0 +1,198 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "base.hpp" + +#include +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_proposal_node.h" + +static std::vector generate_anchors(proposal_conf &conf) { + auto base_size = conf.base_size_; + auto coordinates_offset = conf.coordinates_offset; + auto round_ratios = conf.round_ratios; + + auto num_ratios = conf.ratios.size(); + auto ratios = conf.ratios.data(); + + auto num_scales = conf.scales.size(); + auto scales = conf.scales.data(); + + std::vector anchors(num_scales * num_ratios * 4); + auto anchors_ptr = anchors.data(); + + // base box's width & height & center location + const float base_area = static_cast(base_size * base_size); + const float half_base_size = base_size * 0.5f; + const float center = 0.5f * (base_size - coordinates_offset); + + // enumerate all transformed boxes + for (int ratio = 0; ratio < num_ratios; ++ratio) { + // transformed width & height for given ratio factors + float ratio_w; + float ratio_h; + if (round_ratios) { + ratio_w = std::roundf(std::sqrt(base_area / ratios[ratio])); + ratio_h = std::roundf(ratio_w * ratios[ratio]); + } else { + ratio_w = std::sqrt(base_area / ratios[ratio]); + ratio_h = ratio_w * ratios[ratio]; + } + + float * const p_anchors_wm = anchors_ptr + 0 * num_ratios * num_scales + ratio * num_scales; + float * const p_anchors_hm = anchors_ptr + 1 * num_ratios * num_scales + ratio * num_scales; 
+ float * const p_anchors_wp = anchors_ptr + 2 * num_ratios * num_scales + ratio * num_scales; + float * const p_anchors_hp = anchors_ptr + 3 * num_ratios * num_scales + ratio * num_scales; + + for (int scale = 0; scale < num_scales; ++scale) { + // transformed width & height for given scale factors + const float scale_w = 0.5f * (ratio_w * scales[scale] - coordinates_offset); + const float scale_h = 0.5f * (ratio_h * scales[scale] - coordinates_offset); + + // (x1, y1, x2, y2) for transformed box + p_anchors_wm[scale] = center - scale_w; + p_anchors_hm[scale] = center - scale_h; + p_anchors_wp[scale] = center + scale_w; + p_anchors_hp[scale] = center + scale_h; + + if (conf.shift_anchors) { + p_anchors_wm[scale] -= half_base_size; + p_anchors_hm[scale] -= half_base_size; + p_anchors_wp[scale] -= half_base_size; + p_anchors_hp[scale] -= half_base_size; + } + } + } + return anchors; +} + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNProposalNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto proposal0Op = ngraph::as_type_ptr(op); + const auto proposal4Op = ngraph::as_type_ptr(op); + if (!proposal0Op && !proposal4Op) { + errorMessage = "Node is not an instance of the Proposal from the operations set v0 or v4."; + return false; + } + auto proposalOp = std::dynamic_pointer_cast(op); + if (proposalOp->get_attrs().framework != "tensorflow" && !proposalOp->get_attrs().framework.empty()) { + errorMessage = "Unsupported framework attribute: " + proposalOp->get_attrs().framework; + return false; + } + } catch (...) { + return false; + } + return true; +} + +MKLDNNProposalNode::MKLDNNProposalNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + auto proposalOp = std::dynamic_pointer_cast(op); + auto proposalAttrs = proposalOp->get_attrs(); + + conf.feat_stride_ = proposalAttrs.feat_stride; + conf.base_size_ = proposalAttrs.base_size; + conf.min_size_ = proposalAttrs.min_size; + conf.pre_nms_topn_ = proposalAttrs.pre_nms_topn; + conf.post_nms_topn_ = proposalAttrs.post_nms_topn; + conf.nms_thresh_ = proposalAttrs.nms_thresh; + conf.box_coordinate_scale_ = proposalAttrs.box_coordinate_scale; + conf.box_size_scale_ = proposalAttrs.box_size_scale; + conf.scales = proposalAttrs.scale; + conf.ratios = proposalAttrs.ratio; + conf.normalize_ = proposalAttrs.normalize; + conf.clip_before_nms = proposalAttrs.clip_before_nms; + conf.clip_after_nms = proposalAttrs.clip_after_nms; + conf.anchors_shape_0 = conf.ratios.size() * conf.scales.size(); + + if (proposalAttrs.framework == "tensorflow") { + conf.coordinates_offset = 0.0f; + conf.initial_clip = true; + conf.shift_anchors = true; + conf.round_ratios = false; + conf.swap_xy = true; + } else { + conf.coordinates_offset = 1.0f; + conf.initial_clip = false; + conf.shift_anchors = false; + conf.round_ratios = true; + conf.swap_xy = false; + } + + anchors = generate_anchors(conf); + roi_indices.resize(conf.post_nms_topn_); + + store_prob = op->get_output_size() == 2; +} + +void MKLDNNProposalNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + if (store_prob) { + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, 
Precision::FP32}}, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); + } else { + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, Precision::FP32}}, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); + } +} + +void MKLDNNProposalNode::execute(mkldnn::stream strm) { + try { + const float* probabilitiesData = reinterpret_cast(getParentEdgeAt(PROBABILITIES_IN_IDX)->getMemoryPtr()->GetPtr()); + const float* anchorsData = reinterpret_cast(getParentEdgeAt(ANCHORS_IN_IDX)->getMemoryPtr()->GetPtr()); + const float* imgInfoData = reinterpret_cast(getParentEdgeAt(IMG_INFO_IN_IDX)->getMemoryPtr()->GetPtr()); + float* outRoiData = reinterpret_cast (getChildEdgesAtPort(ROI_OUT_IDX)[0]->getMemoryPtr()->GetPtr()); + float* outProbData = nullptr; + if (store_prob) + outProbData = reinterpret_cast (getChildEdgesAtPort(PROBABILITIES_OUT_IDX)[0]->getMemoryPtr()->GetPtr()); + + auto inProbDims = getParentEdgeAt(0)->getDims().ToSizeVector(); + const size_t imgInfoSize = getParentEdgeAt(2)->getDims()[0]; + + // input image height & width + const float imgHeight = imgInfoData[0]; + const float imgWidth = imgInfoData[1]; + if (!std::isnormal(imgHeight) || !std::isnormal(imgWidth) || (imgHeight < 0.f) || (imgWidth < 0.f)) { + IE_THROW() << "Proposal operation image info input must have positive image height and width."; + } + + // scale factor for height & width + const float scaleHeight = imgInfoData[2]; + const float scaleWidth = imgInfoSize == 4 ? imgInfoData[3] : scaleHeight; + if (!std::isfinite(scaleHeight) || !std::isfinite(scaleWidth) || (scaleHeight < 0.f) || (scaleWidth < 0.f)) { + IE_THROW() << "Proposal operation image info input must have non negative scales."; + } + + InferenceEngine::Extensions::Cpu::XARCH::proposal_exec(probabilitiesData, anchorsData, inProbDims, + {imgHeight, imgWidth, scaleHeight, scaleWidth}, anchors.data(), roi_indices.data(), outRoiData, outProbData, conf); + } catch (const InferenceEngine::Exception& e) { + std::string errorMsg = e.what(); + IE_THROW() << errorMsg; + } +} + +bool MKLDNNProposalNode::created() const { + return getType() == Proposal; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNProposalNode, Proposal) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_proposal_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_proposal_node.h new file mode 100644 index 00000000000000..4fdb333b25921b --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_proposal_node.h @@ -0,0 +1,42 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include "proposal_imp.hpp" + +using proposal_conf = InferenceEngine::Extensions::Cpu::proposal_conf; + +namespace MKLDNNPlugin { + +class MKLDNNProposalNode : public MKLDNNNode { +public: + MKLDNNProposalNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + const size_t PROBABILITIES_IN_IDX = 0lu; + const size_t ANCHORS_IN_IDX = 1lu; + const size_t IMG_INFO_IN_IDX = 2lu; + 
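Editor's note: the generate_anchors helper earlier in this hunk derives each anchor from base_size, one aspect ratio and one scale. A standalone sketch of that arithmetic for a single (ratio, scale) pair, assuming the non-TensorFlow settings (coordinates_offset = 1, rounded ratios) and using example numbers only:

#include <cmath>
#include <iostream>

int main() {
    const float base_size = 16.f, ratio = 0.5f, scale = 8.f;
    const float coordinates_offset = 1.f;   // caffe-like config, round_ratios = true

    const float base_area = base_size * base_size;
    const float center = 0.5f * (base_size - coordinates_offset);

    // width/height for the given aspect ratio, then scaled half-extents
    const float ratio_w = std::round(std::sqrt(base_area / ratio));
    const float ratio_h = std::round(ratio_w * ratio);
    const float scale_w = 0.5f * (ratio_w * scale - coordinates_offset);
    const float scale_h = 0.5f * (ratio_h * scale - coordinates_offset);

    // (x1, y1, x2, y2) of the anchor centred on `center`
    std::cout << "x1=" << center - scale_w << " y1=" << center - scale_h
              << " x2=" << center + scale_w << " y2=" << center + scale_h << '\n';
}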
const size_t ROI_OUT_IDX = 0lu; + const size_t PROBABILITIES_OUT_IDX = 1lu; + + proposal_conf conf; + std::vector anchors; + std::vector roi_indices; + bool store_prob; // store blob with proposal probabilities + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_range_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_range_node.cpp new file mode 100644 index 00000000000000..33e625fce6f88a --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_range_node.cpp @@ -0,0 +1,140 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "base.hpp" + +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_range_node.h" +#include + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNRangeNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + if (!MKLDNNPlugin::one_of(op->get_type_info(), ngraph::op::v0::Range::type_info, ngraph::op::v4::Range::type_info)) { + errorMessage = "Only opset1 and opset4 Range operation is supported"; + return false; + } + if (std::dynamic_pointer_cast(op->get_input_node_shared_ptr(RANGE_START)) == nullptr || + std::dynamic_pointer_cast(op->get_input_node_shared_ptr(RANGE_LIMIT)) == nullptr || + std::dynamic_pointer_cast(op->get_input_node_shared_ptr(RANGE_DELTA)) == nullptr) { + errorMessage = "Only const inputs for Range operation is supported"; + return false; + } + } catch (...) { + return false; + } + return true; +} + +MKLDNNRangeNode::MKLDNNRangeNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "Range layer with name '" + op->get_friendly_name() + "'"; + + if (getOriginalInputsNumber() != 3 || getOriginalOutputsNumber() != 1) + IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; + + SizeVector start_dims = op->get_input_shape(RANGE_START); + if (ngraph::shape_size(start_dims) != 1) + IE_THROW() << errorPrefix << " has start scalar with more than 1 value"; + + SizeVector limit_dims = op->get_input_shape(RANGE_LIMIT); + if (ngraph::shape_size(limit_dims) != 1) + IE_THROW() << errorPrefix << " has limit scalar with more than 1 value"; + + SizeVector delta_dims = op->get_input_shape(RANGE_DELTA); + if (ngraph::shape_size(delta_dims) != 1) + IE_THROW() << errorPrefix << " has delta scalar with more than 1 value"; + + SizeVector dst_dims = op->get_output_shape(0); + if (dst_dims.size() > 1) + IE_THROW() << errorPrefix << " has unsupported rank for output: " << dst_dims.size(); +} + +void MKLDNNRangeNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + std::vector inDataConf; + std::vector outDataConf; + + if (!(getOriginalInputPrecisionAtPort(RANGE_START) == Precision::I32 && + getOriginalInputPrecisionAtPort(RANGE_LIMIT) == Precision::I32 && + getOriginalInputPrecisionAtPort(RANGE_DELTA) == Precision::I32 && + getOriginalOutputPrecisionAtPort(0) == Precision::I32) && + !(getOriginalInputPrecisionAtPort(RANGE_START) == Precision::FP32 && + getOriginalInputPrecisionAtPort(RANGE_LIMIT) == Precision::FP32 && + getOriginalInputPrecisionAtPort(RANGE_DELTA) == Precision::FP32 && + getOriginalOutputPrecisionAtPort(0) == Precision::FP32)) { + 
inDataConf.reserve(getOriginalInputsNumber()); + for (int i = 0; i < getOriginalInputsNumber(); ++i) + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::FP32); + outDataConf.reserve(1); + outDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::FP32); + addSupportedPrimDesc(inDataConf, outDataConf, impl_desc_type::ref_any); + } else { + inDataConf.reserve(getOriginalInputsNumber()); + for (int i = 0; i < getOriginalInputsNumber(); ++i) + inDataConf.emplace_back(TensorDescCreatorTypes::ncsp); + outDataConf.reserve(1); + outDataConf.emplace_back(TensorDescCreatorTypes::ncsp); + addSupportedPrimDesc(inDataConf, outDataConf, impl_desc_type::ref_any); + } +} + +void MKLDNNRangeNode::execute(mkldnn::stream strm) { + StatusCode retcode = OK; + switch (getParentEdgeAt(0)->getDesc().getPrecision()) { + case Precision::FP32: + retcode = rangeKernel(); + break; + case Precision::I32: + retcode = rangeKernel(); + break; + default: + IE_THROW() << "Incorrect output precision. Only FP32 and I32 are supported!"; + } + if (retcode == PARAMETER_MISMATCH) { + std::string errorMsg = "Range indexes exceeds data tensor dimension"; + IE_THROW() << errorMsg; + } +} + +template +InferenceEngine::StatusCode MKLDNNRangeNode::rangeKernel() noexcept { + size_t dst_size = (getChildEdgesAtPort(0)[0]->getDims())[0]; + data_t* dst_data = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + data_t start = reinterpret_cast(getParentEdgeAt(RANGE_START)->getMemoryPtr()->GetPtr())[0]; + data_t limit = reinterpret_cast(getParentEdgeAt(RANGE_LIMIT)->getMemoryPtr()->GetPtr())[0]; + data_t delta = reinterpret_cast(getParentEdgeAt(RANGE_DELTA)->getMemoryPtr()->GetPtr())[0]; + size_t work_amount_dst = static_cast(std::floor(std::abs((limit - start) / delta))); + if (work_amount_dst != dst_size) + return PARAMETER_MISMATCH; + + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t iwork = 0, end = 0; + splitter(work_amount_dst, nthr, ithr, iwork, end); + data_t dst_value = start + iwork * delta; + + for (; iwork < end; ++iwork, dst_value += delta) { + dst_data[iwork] = dst_value; + } + }); + return OK; +} + +bool MKLDNNRangeNode::created() const { + return getType() == Range; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNRangeNode, Range) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_range_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_range_node.h new file mode 100644 index 00000000000000..b5584be6aa949c --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_range_node.h @@ -0,0 +1,34 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNRangeNode : public MKLDNNNode { +public: + MKLDNNRangeNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + + template + InferenceEngine::StatusCode rangeKernel() noexcept; +private: + static const size_t RANGE_START = 0; + static const size_t RANGE_LIMIT = 1; + static const size_t RANGE_DELTA = 2; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorg_yolo_node.cpp 
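Editor's note: rangeKernel above derives the expected element count from (limit - start) / delta and fills start + i * delta. A minimal standalone sketch of that generation with arbitrary example inputs (not the plugin's memory handling):

#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
    const float start = 2.f, limit = 23.f, delta = 3.f;

    // element count mirrors the check in rangeKernel
    const size_t count = static_cast<size_t>(std::floor(std::abs((limit - start) / delta)));
    std::vector<float> dst(count);

    float value = start;
    for (size_t i = 0; i < count; ++i, value += delta)
        dst[i] = value;

    for (float v : dst) std::cout << v << ' ';  // 2 5 8 11 14 17 20
    std::cout << '\n';
}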
b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorg_yolo_node.cpp new file mode 100644 index 00000000000000..3db7470e92fba9 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorg_yolo_node.cpp @@ -0,0 +1,93 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "base.hpp" + +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_reorg_yolo_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNReorgYoloNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto reorgYolo = std::dynamic_pointer_cast(op); + if (!reorgYolo) { + errorMessage = "Only opset2 ReorgYolo operation is supported"; + return false; + } + } catch (...) { + return false; + } + return true; +} + +MKLDNNReorgYoloNode::MKLDNNReorgYoloNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = std::string(op->get_type_name()) + " node with name '" + op->get_friendly_name() + "'"; + if (getOriginalInputsNumber() != 1 || getOriginalOutputsNumber() != 1) + IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; + + const auto reorgYolo = std::dynamic_pointer_cast(op); + const auto strides = reorgYolo->get_strides(); + if (strides.empty()) + IE_THROW() << errorPrefix << " has empty strides"; + stride = strides[0]; +} + +void MKLDNNReorgYoloNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); +} + +void MKLDNNReorgYoloNode::execute(mkldnn::stream strm) { + const auto *src_data = reinterpret_cast(getParentEdgeAt(0)->getMemoryPtr()->GetPtr()); + auto *dst_data = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + + int IW = (getParentEdgeAt(0)->getDesc().getDims().size() > 3) ? getParentEdgeAt(0)->getDims()[3] : 1; + int IH = (getParentEdgeAt(0)->getDesc().getDims().size() > 2) ? getParentEdgeAt(0)->getDims()[2] : 1; + int IC = (getParentEdgeAt(0)->getDesc().getDims().size() > 1) ? getParentEdgeAt(0)->getDims()[1] : 1; + int B = (getParentEdgeAt(0)->getDesc().getDims().size() > 0) ? 
getParentEdgeAt(0)->getDims()[0] : 1; + + int ic_off = IC / (stride * stride); + int ih_off = IH * stride; + int iw_off = IW * stride; + for (int b = 0; b < B; b++) { + for (int ic = 0; ic < IC; ic++) { + for (int ih = 0; ih < IH; ih++) { + for (int iw = 0; iw < IW; iw++) { + int dstIndex = b * IC * IH * IW + ic * IH * IW + ih * IW + iw; + + int oc = ic % ic_off; + int offset = ic / ic_off; + + int ow = iw * stride + offset % stride; + int oh = ih * stride + offset / stride; + + int srcIndex = b * ic_off * ih_off * iw_off + oc * ih_off * iw_off + oh * iw_off + ow; + + dst_data[dstIndex] = src_data[srcIndex]; + } + } + } + } +} + +bool MKLDNNReorgYoloNode::created() const { + return getType() == ReorgYolo; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNReorgYoloNode, ReorgYolo) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorg_yolo_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorg_yolo_node.h new file mode 100644 index 00000000000000..b88f19010e0491 --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorg_yolo_node.h @@ -0,0 +1,30 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNReorgYoloNode : public MKLDNNNode { +public: + MKLDNNReorgYoloNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + int stride; + + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reverse_sequence_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reverse_sequence_node.cpp new file mode 100644 index 00000000000000..5f6e6083e90c4a --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reverse_sequence_node.cpp @@ -0,0 +1,182 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "base.hpp" + +#include +#include + +#include +#include "ie_parallel.hpp" +#include "mkldnn_reverse_sequence_node.h" + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNReverseSequenceNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto revSeq = std::dynamic_pointer_cast(op); + if (!revSeq) { + errorMessage = "Only opset1 ReverseSequence operation is supported"; + return false; + } + } catch (...) 
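Editor's note: the ReorgYolo loop above is a pure index permutation. The following standalone sketch runs the same remapping on a deliberately tiny tensor (B=1, IC=8, IH=IW=1, stride=2) so the resulting interleaving is easy to see; the input is just 0..7.

#include <iostream>
#include <vector>

int main() {
    const int B = 1, IC = 8, IH = 1, IW = 1, stride = 2;
    std::vector<float> src = {0, 1, 2, 3, 4, 5, 6, 7};
    std::vector<float> dst(src.size());

    const int ic_off = IC / (stride * stride);
    const int ih_off = IH * stride;
    const int iw_off = IW * stride;

    for (int b = 0; b < B; ++b)
        for (int ic = 0; ic < IC; ++ic)
            for (int ih = 0; ih < IH; ++ih)
                for (int iw = 0; iw < IW; ++iw) {
                    const int dstIndex = ((b * IC + ic) * IH + ih) * IW + iw;

                    const int oc = ic % ic_off;      // channel inside the reduced channel block
                    const int offset = ic / ic_off;  // which of the stride*stride sub-positions

                    const int ow = iw * stride + offset % stride;
                    const int oh = ih * stride + offset / stride;

                    const int srcIndex = ((b * ic_off + oc) * ih_off + oh) * iw_off + ow;
                    dst[dstIndex] = src[srcIndex];
                }

    for (float v : dst) std::cout << v << ' ';  // 0 4 1 5 2 6 3 7
    std::cout << '\n';
}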
{ + return false; + } + return true; +} + +MKLDNNReverseSequenceNode::MKLDNNReverseSequenceNode(const std::shared_ptr& op, const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + errorPrefix = "ReverseSequence layer with name '" + op->get_friendly_name() + "'"; + const auto revSeq = std::dynamic_pointer_cast(op); + + if (getOriginalInputsNumber() != 2 || getOriginalOutputsNumber() != 1) + IE_THROW() << errorPrefix << " has incorrect number of input/output edges!"; + + src_dims = op->get_input_shape(REVERSESEQUENCE_DATA); + + SizeVector seq_lengths_dims = op->get_input_shape(REVERSESEQUENCE_LENGTHS); + if (seq_lengths_dims.size() != 1) + IE_THROW() << errorPrefix << " has incorrect 2nd input rank: " << seq_lengths_dims.size(); + + SizeVector dst_dims = op->get_output_shape(0); + if (src_dims.size() != dst_dims.size()) + IE_THROW() << errorPrefix << " has incorrect number of input/output sizes!"; + + for (size_t i = 0; i < dst_dims.size(); i++) { + if (src_dims[i] != dst_dims[i]) + IE_THROW() << errorPrefix << " has incorrect number of input/output dimension!"; + } + + seq_axis = revSeq->get_sequence_axis(); + + if (seq_axis < 0 || seq_axis >= static_cast(src_dims.size())) + IE_THROW() << errorPrefix << " has incorrect 'seq_axis' parameters dimensions and axis number!"; + + batch_axis = revSeq->get_batch_axis(); + + if (batch_axis < 0 || batch_axis >= static_cast(src_dims.size())) + IE_THROW() << errorPrefix << " has incorrect 'batch_axis' parameters dimensions and axis number!"; + + if (seq_lengths_dims[0] != dst_dims[batch_axis]) + IE_THROW() << errorPrefix << " has incorrect 'seq_lengths_dims' parameters dimension!"; + + srcStrides.resize(src_dims.size()); + srcStrides[srcStrides.size() - 1] = 1; + for (int i = srcStrides.size() - 2; i >= 0; i--) { + srcStrides[i] = srcStrides[i + 1] * src_dims[i + 1]; + } + + work_amount_dst = srcStrides[0] * src_dims[0]; +} + +void MKLDNNReverseSequenceNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + lengthsPrecision = getOriginalInputPrecisionAtPort(REVERSESEQUENCE_LENGTHS); + if (lengthsPrecision != Precision::I32 && lengthsPrecision != Precision::FP32) + lengthsPrecision = Precision::I32; + + addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, Precision::FP32}, + {TensorDescCreatorTypes::ncsp, lengthsPrecision}}, + {{TensorDescCreatorTypes::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); +} + +void MKLDNNReverseSequenceNode::execute(mkldnn::stream strm) { + size_t i; + const float *src_data = reinterpret_cast(getParentEdgeAt(REVERSESEQUENCE_DATA)->getMemoryPtr()->GetPtr()); + float* dst_data = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); + + switch (getParentEdgeAt(REVERSESEQUENCE_LENGTHS)->getDesc().getPrecision()) { + case Precision::FP32: { + float *seq_lengths_data = reinterpret_cast(getParentEdgeAt(REVERSESEQUENCE_LENGTHS)->getMemoryPtr()->GetPtr()); + for (i = 0; i < src_dims[batch_axis]; i++) { + if (static_cast(seq_lengths_data[i]) > static_cast(src_dims[seq_axis])) { + std::string errorMsg = "Incorrect input 'seq_lengths' values!"; + IE_THROW() << errorMsg; + } + } + + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t i, start = 0, end = 0, src_idx = 0; + SizeVector counters(src_dims.size(), 0); + splitter(work_amount_dst, nthr, ithr, start, end); + for (int j = src_dims.size() - 
1, i = start; j >= 0; j--) { + counters[j] = i % src_dims[j]; + i /= src_dims[j]; + } + + for (size_t iwork = start; iwork < end; ++iwork) { + for (i = 0, src_idx = 0; i < src_dims.size(); ++i) { + size_t idx = counters[i]; + if (static_cast(i) == seq_axis && + static_cast(idx) < static_cast(seq_lengths_data[counters[batch_axis]])) { + idx = static_cast(seq_lengths_data[counters[batch_axis]]) - idx - 1; + } + src_idx += idx * srcStrides[i]; + } + dst_data[iwork] = src_data[src_idx]; + for (int j = src_dims.size() - 1; j >= 0; j--) { + counters[j] = (counters[j] + 1) % src_dims[j]; + if (counters[j] != 0) break; + } + } + }); + } + break; + case Precision::I32: { + int32_t *seq_lengths_data = reinterpret_cast(getParentEdgeAt(REVERSESEQUENCE_LENGTHS)->getMemoryPtr()->GetPtr()); + for (i = 0; i < src_dims[batch_axis]; i++) { + if (seq_lengths_data[i] > static_cast(src_dims[seq_axis])) { + std::string errorMsg = "Incorrect input 'seq_lengths' values!"; + IE_THROW() << errorMsg; + } + } + + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t i, start = 0, end = 0, src_idx = 0; + SizeVector counters(src_dims.size(), 0); + splitter(work_amount_dst, nthr, ithr, start, end); + for (int j = src_dims.size() - 1, i = start; j >= 0; j--) { + counters[j] = i % src_dims[j]; + i /= src_dims[j]; + } + + for (size_t iwork = start; iwork < end; ++iwork) { + for (i = 0, src_idx = 0; i < src_dims.size(); ++i) { + size_t idx = counters[i]; + if (static_cast(i) == seq_axis && + static_cast(idx) < seq_lengths_data[counters[batch_axis]]) { + idx = seq_lengths_data[counters[batch_axis]] - idx - 1; + } + src_idx += idx * srcStrides[i]; + } + dst_data[iwork] = src_data[src_idx]; + for (int j = src_dims.size() - 1; j >= 0; j--) { + counters[j] = (counters[j] + 1) % src_dims[j]; + if (counters[j] != 0) break; + } + } + }); + } + break; + default: + IE_THROW() << "ReverseSequence layer does not support " + << getParentEdgeAt(REVERSESEQUENCE_LENGTHS)->getDesc().getPrecision() << " precision"; + } +} + +bool MKLDNNReverseSequenceNode::created() const { + return getType() == ReverseSequence; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNReverseSequenceNode, ReverseSequence) diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reverse_sequence_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reverse_sequence_node.h new file mode 100644 index 00000000000000..4b3cf056c63afa --- /dev/null +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reverse_sequence_node.h @@ -0,0 +1,38 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNReverseSequenceNode : public MKLDNNNode { +public: + MKLDNNReverseSequenceNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override {}; + void execute(mkldnn::stream strm) override; + bool created() const override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + const size_t REVERSESEQUENCE_DATA = 0; + const size_t REVERSESEQUENCE_LENGTHS = 1; + + int seq_axis; + int batch_axis; + InferenceEngine::SizeVector src_dims; + InferenceEngine::SizeVector srcStrides; + size_t work_amount_dst; + + InferenceEngine::Precision lengthsPrecision; + std::string errorPrefix; +}; + +} // namespace MKLDNNPlugin diff --git 
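Editor's note: the ReverseSequence execute above reverses the first seq_lengths[b] entries along seq_axis for each batch entry, leaving the tail untouched. A standalone sketch of that semantics for the simple 2-D case (batch_axis = 0, seq_axis = 1) with invented values; the node itself handles arbitrary ranks and axes via the stride walk shown above.

#include <iostream>
#include <vector>

int main() {
    const int batch = 2, time = 4;   // batch_axis = 0, seq_axis = 1
    std::vector<float> src = {1, 2, 3, 4,
                              5, 6, 7, 8};
    std::vector<int> seq_lengths = {3, 2};
    std::vector<float> dst(src.size());

    for (int b = 0; b < batch; ++b)
        for (int t = 0; t < time; ++t) {
            const int len = seq_lengths[b];
            const int src_t = (t < len) ? len - t - 1 : t;  // reverse only the first len steps
            dst[b * time + t] = src[b * time + src_t];
        }

    for (float v : dst) std::cout << v << ' ';  // 3 2 1 4  6 5 7 8
    std::cout << '\n';
}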
a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.cpp index 77db762169254e..a1a7f8329a5c52 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.cpp @@ -88,8 +88,7 @@ struct jit_uni_roi_pooling_kernel_f32 : public jit_uni_roi_pooling_kernel, publi this->postamble(); load_emitter->emit_data(); - if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core) && store_emitter != nullptr && store_emitter->get_emu_vcvtneps2bf16() != nullptr) - store_emitter->get_emu_vcvtneps2bf16()->emit_data(); + store_emitter->emit_data(); } private: @@ -155,7 +154,7 @@ struct jit_uni_roi_pooling_kernel_f32 : public jit_uni_roi_pooling_kernel, publi Vmm vmm_max = get_acc_reg(i); load_emitter->emit_code({static_cast(reg_input.getIdx())}, {static_cast(vmm_max.getIdx())}, - std::make_shared(jpp_.src_prc, Precision::FP32, step, false, "zero", i * src_c_off), + std::make_shared(jpp_.src_prc, Precision::FP32, step, i * src_c_off), {}, load_pool_gpr_idxs); } @@ -169,7 +168,7 @@ struct jit_uni_roi_pooling_kernel_f32 : public jit_uni_roi_pooling_kernel, publi Vmm vmm_src = get_src_reg(i); load_emitter->emit_code({static_cast(aux_reg_input1.getIdx())}, {static_cast(vmm_src.getIdx())}, - std::make_shared(jpp_.src_prc, Precision::FP32, step, false, "zero", i * src_c_off), + std::make_shared(jpp_.src_prc, Precision::FP32, step, i * src_c_off), {}, load_pool_gpr_idxs); if (isa == cpu::x64::sse41) { @@ -222,7 +221,7 @@ struct jit_uni_roi_pooling_kernel_f32 : public jit_uni_roi_pooling_kernel, publi for (int i = 0; i < c_blocks; i++) { const int src_c_off = i * jpp_.ih * jpp_.iw * jpp_.c_block * jpp_.src_data_size; - const auto load_context = std::make_shared(jpp_.src_prc, Precision::FP32, step, false, "zero", src_c_off); + const auto load_context = std::make_shared(jpp_.src_prc, Precision::FP32, step, src_c_off); mov(aux_reg_input, reg_input); diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.cpp index 6d4c9a27dc4d8b..53dda785e69115 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.cpp @@ -14,7 +14,7 @@ using namespace InferenceEngine; MKLDNNSoftMaxNode::MKLDNNSoftMaxNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { - auto softmaxOp = ngraph::as_type_ptr(op); + const auto softmaxOp = ngraph::as_type_ptr(op); if (softmaxOp) { axis = softmaxOp->get_axis(); } else { diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_strided_slice_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_strided_slice_node.cpp index 07cc72247a5eff..1b70de9f0f8341 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_strided_slice_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_strided_slice_node.cpp @@ -86,7 +86,7 @@ MKLDNNStridedSliceNode::MKLDNNStridedSliceNode(const std::shared_ptrisConstant() && node->getType() == Input; + return node->getType() == Input && node->isConstant(); }; params.parametersAreConstant = isConstantNode(getParentEdgesAtPort(BEGIN_ID)[0]->getParent()) && @@ -138,7 +138,11 @@ void MKLDNNStridedSliceNode::getSupportedDescriptors() { if (params.parametersAreConstant) { auto fillingInParameters = [&](std::vector ¶meter, const size_t type, const size_t size, 
                                        const int value) {
-            auto blob = std::dynamic_pointer_cast<MKLDNNInputNode>(getParentEdgesAtPort(type)[0]->getParent())->getMemoryPtr();
+            const auto constNode = std::dynamic_pointer_cast<MKLDNNInputNode>(getParentEdgesAtPort(type)[0]->getParent());
+            if (!constNode) {
+                THROW_ERROR << "can't cast node on " << type << " port to MKLDNNInputNode";
+            }
+            auto blob = constNode->getMemoryPtr();
             if (blob->GetDataType() != mkldnn::memory::data_type::s32)
                 THROW_ERROR << "supports only parameters input with precision I32";
             const int *ptr = static_cast<const int *>(blob->GetPtr());
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tensoriterator_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tensoriterator_node.cpp
index c9a53c79e07865..d1d80e1b7cba7b 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tensoriterator_node.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_tensoriterator_node.cpp
@@ -136,6 +136,9 @@ class IterCountPortHelper : public PortMapHelper {
     void execute(mkldnn::stream strm, int n_iter) override {
         auto mem = mem_holder_dst;
         auto data_ptr = static_cast<int32_t*>(mem.get_data_handle());
+        if (data_ptr == nullptr) {
+            IE_THROW() << "TensorIterator node has not allocated memory for IterCountPortHelper";
+        }
         *data_ptr = n_iter;
     }
 };
@@ -150,6 +153,9 @@ class asBoolCheck : public PortChecker {
 
     int getStatus() override {
         auto data_ptr = static_cast<uint8_t*>(mem_holder.get_data_handle());
+        if (data_ptr == nullptr) {
+            IE_THROW() << "TensorIterator node has not allocated memory for asBoolCheck";
+        }
         return *data_ptr == static_cast<uint8_t>(0) ? 0 : 1;
     }
 };
@@ -164,6 +170,9 @@ class asIntCheck : public PortChecker {
 
     int getStatus() override {
         auto data_ptr = static_cast<int32_t*>(mem_holder.get_data_handle());
+        if (data_ptr == nullptr) {
+            IE_THROW() << "TensorIterator node has not allocated memory for asIntCheck";
+        }
         return *data_ptr;
     }
 };
@@ -283,6 +292,9 @@ MKLDNNTensorIteratorNode::MKLDNNTensorIteratorNode(const std::shared_ptr<ngraph::Node>& op, const mkldnn
 
     auto tiOp = std::dynamic_pointer_cast<ngraph::op::util::SubGraphOp>(ngraphOp);
+    if (tiOp == nullptr) {
+        IE_THROW() << "Can't cast TensorIterator node with name: " << getName() << " to ngraph::op::util::SubGraphOp";
+    }
     const std::shared_ptr<ngraph::Function> body = tiOp->get_function();
     sub_graph.CreateGraph(body, ext_mng, weightCache);
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_topk_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_topk_node.cpp
new file mode 100644
index 00000000000000..1c78c44b48df5a
--- /dev/null
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_topk_node.cpp
@@ -0,0 +1,478 @@
+// Copyright (C) 2018-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <string>
+
+#include <ngraph/opsets/opset1.hpp>
+#include "ie_parallel.hpp"
+#include "mkldnn_topk_node.h"
+#include "utils/general_utils.h"
+
+#if defined(HAVE_SSE) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+#include <immintrin.h>
+#endif
+
+using namespace MKLDNNPlugin;
+using namespace InferenceEngine;
+
+bool MKLDNNTopKNode::isSupportedOperation(const std::shared_ptr<ngraph::Node>& op, std::string& errorMessage) noexcept {
+    try {
+        const auto topKOp = ngraph::as_type_ptr<const ngraph::op::v1::TopK>(op);
+        if (!topKOp) {
+            errorMessage = "Node is not an instance of the TopK from the operations set v1 or v3";
+            return false;
+        }
+        if (topKOp->get_mode() != ngraph::op::TopKMode::MAX &&
+            topKOp->get_mode() != ngraph::op::TopKMode::MIN) {
+            errorMessage = "Unsupported mode.";
+            return false;
+        }
+        if (!MKLDNNPlugin::one_of(topKOp->get_sort_type(), ngraph::op::TopKSortType::NONE,
+                                  ngraph::op::TopKSortType::SORT_VALUES,
+                                  ngraph::op::TopKSortType::SORT_INDICES)) {
+            errorMessage = "Unsupported sort type.";
+            return false;
+        }
+    } catch (...) {
+        return false;
+    }
+    return true;
+}
+
+MKLDNNTopKNode::MKLDNNTopKNode(const std::shared_ptr<ngraph::Node>& op, const mkldnn::engine& eng,
+        MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) {
+    std::string errorMessage;
+    if (!isSupportedOperation(op, errorMessage)) {
+        IE_THROW(NotImplemented) << errorMessage;
+    }
+    auto topK1Op = ngraph::as_type_ptr<ngraph::op::v1::TopK>(op);
+
+    SizeVector dstDims = topK1Op->get_output_shape(TOPK_VALUE);
+    src_dims = topK1Op->get_input_shape(TOPK_DATA);
+
+    axis = topK1Op->get_axis();
+
+    if (topK1Op->get_mode() == ngraph::op::TopKMode::MAX)
+        mode_max = true;
+    else
+        mode_max = false;
+
+    if (topK1Op->get_sort_type() == ngraph::op::TopKSortType::SORT_VALUES)
+        sort_value = true;
+    else
+        sort_value = false;
+
+    int j;
+    for (j = src_dims.size() - 1; j >= 0; j--) {
+        if (src_dims[j] != 1) break;
+    }
+    if (static_cast<size_t>(j) == axis) is_last_dim = true;
+
+    for (size_t i = 0; i < axis; i++) {
+        axis_step *= src_dims[i];
+    }
+    axis_dim = src_dims[axis];
+    for (size_t i = (axis + 1); i < src_dims.size(); i++) {
+        axis_stride *= src_dims[i];
+    }
+    dim = static_cast<int>(src_dims[axis]);
+    before_num = count(src_dims, 0, axis);
+}
+
+void MKLDNNTopKNode::initSupportedPrimitiveDescriptors() {
+    if (!supportedPrimitiveDescriptors.empty())
+        return;
+
+    std::vector<PortConfigurator> outDataConf;
+    outDataConf.reserve(getOriginalOutputsNumber());
+    outDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::FP32);
+    for (int i = 1; i < getOriginalOutputsNumber(); ++i)
+        outDataConf.emplace_back(TensorDescCreatorTypes::ncsp, Precision::I32);
+
+    addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, Precision::FP32},
+                          {TensorDescCreatorTypes::ncsp, Precision::I32}},
+                         outDataConf,
+                         impl_desc_type::ref_any);
+}
+
+void MKLDNNTopKNode::execute(mkldnn::stream strm) {
+    const float *src = reinterpret_cast<const float *>(getParentEdgeAt(TOPK_DATA)->getMemoryPtr()->GetPtr());
+    src_k = reinterpret_cast<int *>(getParentEdgeAt(TOPK_K)->getMemoryPtr()->GetPtr())[0];
+    float* dst_data = nullptr;
+    int* dst_idx = nullptr;
+
+    if (outDims.size() == 1) {
+        if (getOriginalOutputPrecisionAtPort(0) == Precision::FP32) {
+            dst_data = reinterpret_cast<float *>(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr());
+        } else {
+            dst_idx = reinterpret_cast<int *>(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr());
+        }
+        SizeVector dstDims = getChildEdgesAtPort(0)[0]->getDims().ToSizeVector();
+
+        if (dstDims[axis] != static_cast<size_t>(src_k)) {
+            std::string errorMsg = "Output tensor dimension mismatch";
+            IE_THROW() << errorMsg;
+        }
+    } else if (outDims.size() == 2) {
+        dst_data = reinterpret_cast<float *>(getChildEdgesAtPort(TOPK_VALUE)[0]->getMemoryPtr()->GetPtr());
+        SizeVector dst_data_dims = getChildEdgesAtPort(TOPK_VALUE)[0]->getDims().ToSizeVector();
+
+        dst_idx = reinterpret_cast<int *>(getChildEdgesAtPort(TOPK_INDEX)[0]->getMemoryPtr()->GetPtr());
+        SizeVector dst_idx_dims = getChildEdgesAtPort(TOPK_INDEX)[0]->getDims().ToSizeVector();
+
+        if (dst_idx_dims[axis] != static_cast<size_t>(src_k) || dst_data_dims[axis] != static_cast<size_t>(src_k)) {
+            std::string errorMsg = "Output tensors dimension mismatch";
+            IE_THROW() << errorMsg;
+        }
+    } else {
+        std::string errorMsg = "Output tensors amount mismatch";
+        IE_THROW() << errorMsg;
+    }
+
+    if (src_dims[axis] < static_cast<size_t>(src_k))
+        src_k = src_dims[axis];
+
+    SizeVector in_dims = getParentEdgeAt(TOPK_DATA)->getDims().ToSizeVector();
+
+    if (src_k == 1) {
+        if (is_last_dim) {
+            if (mode_max)
+                top1<cmpgt_ps, std::greater>(src, dst_data, dst_idx, in_dims);
+            else
+                top1<cmplt_ps, std::less>(src, dst_data, dst_idx, in_dims);
+        } else {
+            if (mode_max)
+                top1_axis<cmpgt_ps, std::greater>(src, dst_data, dst_idx, in_dims);
+            else
+                top1_axis<cmplt_ps, std::less>(src, dst_data, dst_idx, in_dims);
+        }
+    } else {
+        if (is_last_dim) {
+            if (mode_max)
+                topk<cmpgt_ps, std::greater>(src, dst_data, dst_idx, in_dims);
+            else
+                topk<cmplt_ps, std::less>(src, dst_data, dst_idx, in_dims);
+        } else {
+            if (mode_max)
+                topk_axis<cmpgt_ps, std::greater>(src, dst_data, dst_idx, in_dims);
+            else
+                topk_axis<cmplt_ps, std::less>(src, dst_data, dst_idx, in_dims);
+        }
+    }
+}
+
+bool MKLDNNTopKNode::created() const {
+    return getType() == TopK;
+}
+
+template <class Compare1, template <typename> class Compare2>
+void MKLDNNTopKNode::top1_axis(const float* src_data, float* dst_data, int* dst_idx, SizeVector in_dims) {
+    int after_num = count(in_dims, axis + 1, in_dims.size());
+    int first_index = 0;
+
+#if defined(HAVE_SSE) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
+    parallel_for2d(before_num, after_num / block_size, [&](int i0, int ib1) {
+        int s_index = i0 * dim * after_num + ib1 * block_size;
+        vec_type_f vmax_val = _mm_uni_loadu_ps(src_data + s_index);
+        vec_type_i vindex_max_val = _mm_uni_setzero_si();
+        for (int i2 = 1; i2 < dim; i2++) {
+            s_index += after_num;
+            vec_type_f vsrc = _mm_uni_loadu_ps(src_data + s_index);
+            vmask_type vmask = Compare1::cmp_ps(vsrc, vmax_val);
+            vmax_val = _mm_uni_blendv_ps(vmax_val, vsrc, vmask);
+
+            vec_type_i vindex_cur_val = _mm_uni_set1_epi32(i2);
+#if defined(HAVE_AVX512F)
+            vindex_max_val = _mm512_mask_blend_epi32(vmask, vindex_max_val, vindex_cur_val);
+#else
+            vindex_max_val = _mm_uni_blendv_epi8(vindex_max_val, vindex_cur_val, _mm_uni_castps_si(vmask));
+#endif
+        }
+        if (dst_data)
+            _mm_uni_storeu_ps(dst_data + i0 * after_num + ib1 * block_size, vmax_val);
+        if (dst_idx)
+            _mm_uni_storeu_si(reinterpret_cast<vec_type_i*>(dst_idx + i0 * after_num + ib1 * block_size), vindex_max_val);
+    });
+    first_index = after_num / block_size * block_size;
+#endif
+    int rest = after_num - first_index;
+    parallel_for2d(before_num, rest, [&](int i0, int i1) {
+        int index_max_val = 0;
+        int s_index = i0 * dim * after_num + first_index + i1;
+        float max_val = src_data[s_index];
+        for (int i2 = 1; i2 < dim; i2++) {
+            s_index += after_num;
+            if (Compare2<float>()(src_data[s_index], max_val)) {
+                max_val = src_data[s_index];
+                index_max_val = i2;
+            }
+        }
+        if (dst_data)
+            dst_data[i0 * after_num + first_index + i1] = max_val;
+        if (dst_idx)
+            dst_idx[i0 * after_num + first_index + i1] = index_max_val;
+    });
+}
+
+template